diff --git a/2b855b11bc4seed2/evaluation/generation/merged.csv b/2b855b11bc4seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..fe1554f01f6fbfcf9d4dcbd6108b329acce73990 --- /dev/null +++ b/2b855b11bc4seed2/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.05603098508787396 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.05603098508787396 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.11813763333164416 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.11813763333164416 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13757318526100176 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13757318526100176 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14464807696490528 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.14464807696490528 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.14604424103058358 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.14604424103058358 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.15173507276703452 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.15173507276703452 +e2e_nlg_cleaned,5,average,multiple,0.1256948657405072 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.044139249367473786 +gem_xsum,0,median,rouge2_fmeasure,0.044139249367473786 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03540453263680104 +gem_xsum,1,median,rouge2_fmeasure,0.03540453263680104 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.0348723834844472 +gem_xsum,2,median,rouge2_fmeasure,0.0348723834844472 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03171913029877269 +gem_xsum,3,median,rouge2_fmeasure,0.03171913029877269 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.007074958138573533 +gem_xsum,4,median,rouge2_fmeasure,0.007074958138573533 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00012314549854098596 +gem_xsum,5,median,rouge2_fmeasure,0.00012314549854098596 +gem_xsum,5,average,multiple,0.025555566570768205 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04899451308031972 +web_nlg_en,0,median,rouge2_fmeasure,0.04899451308031972 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.050978314953605365 +web_nlg_en,1,median,rouge2_fmeasure,0.050978314953605365 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05107703767051227 +web_nlg_en,2,median,rouge2_fmeasure,0.05107703767051227 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.053756988677712864 +web_nlg_en,3,median,rouge2_fmeasure,0.053756988677712864 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05523496800965618 +web_nlg_en,4,median,rouge2_fmeasure,0.05523496800965618 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054826378486590326 +web_nlg_en,5,median,rouge2_fmeasure,0.054826378486590326 +web_nlg_en,5,average,multiple,0.05247803347973279 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03498414446894612 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03498414446894612 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.044212366766547766 +wiki_lingua_en,1,median,rouge2_fmeasure,0.044212366766547766 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05013925437862328 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05013925437862328 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04125017703720899 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04125017703720899 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01492978001583821 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01492978001583821 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0022181250558980193 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0022181250558980193 +wiki_lingua_en,5,average,multiple,0.0312889746205104 diff --git a/2b855b11bc4seed2/evaluation/generation/merged.json b/2b855b11bc4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..bc72b54b78d160a9eaefcde5d31b542869d46e0d --- /dev/null +++ b/2b855b11bc4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3669019692085673, "bleu_stderr": 0.03735576208579507, "rouge1_fmeasure": 0.10488940315310863, "rouge1_fmeasure_stderr": 0.002182807125149089, "rouge1_precision": 0.07108949670602635, "rouge1_precision_stderr": 0.0018605155075932162, "rouge1_recall": 0.2944008471177708, "rouge1_recall_stderr": 0.00496048505670551, "rouge2_fmeasure": 0.04899451308031972, "rouge2_fmeasure_stderr": 0.0013409522053557752, "rouge2_precision": 0.03282356988116583, "rouge2_precision_stderr": 0.0011424627167812825, "rouge2_recall": 0.14057659944269588, "rouge2_recall_stderr": 0.003256076102514853, "rougeL_fmeasure": 0.10057011768365899, "rougeL_fmeasure_stderr": 0.002005974356548281, "rougeL_precision": 0.06788803543605783, "rougeL_precision_stderr": 0.0017222927384213094, "rougeL_recall": 0.2859195847396627, "rougeL_recall_stderr": 0.00482334950262861, "rougeLsum_fmeasure": 0.0995943061498225, "rougeLsum_fmeasure_stderr": 0.002034666255159939, "rougeLsum_precision": 0.0675573531009672, "rougeLsum_precision_stderr": 0.001764984902165412, "rougeLsum_recall": 0.2797874789377513, "rougeLsum_recall_stderr": 0.004633335830609517}}, "1": {"PALM_prompt": {"bleu": 0.4457796396280548, "bleu_stderr": 0.03663789805602827, "rouge1_fmeasure": 0.11290267432024997, "rouge1_fmeasure_stderr": 0.002086212254260406, "rouge1_precision": 0.07328276677295399, "rouge1_precision_stderr": 0.001595296009075034, "rouge1_recall": 0.357873529297584, "rouge1_recall_stderr": 0.005356216012775593, "rouge2_fmeasure": 0.050978314953605365, "rouge2_fmeasure_stderr": 0.0012648239214954904, "rouge2_precision": 0.032960565662680366, "rouge2_precision_stderr": 0.00092590184441572, "rouge2_recall": 0.1681640544477541, "rouge2_recall_stderr": 0.003577309598153851, "rougeL_fmeasure": 0.10461737503039704, "rougeL_fmeasure_stderr": 0.0018238001476137153, "rougeL_precision": 0.0676147386351206, "rougeL_precision_stderr": 0.0013707383628966691, "rougeL_recall": 0.3328818182866075, "rougeL_recall_stderr": 0.004895092850273884, "rougeLsum_fmeasure": 0.10672422186964665, "rougeLsum_fmeasure_stderr": 0.0019474862069458747, "rougeLsum_precision": 0.06926122229303046, "rougeLsum_precision_stderr": 0.0014823373285525329, "rougeLsum_recall": 0.3364706883571429, "rougeLsum_recall_stderr": 0.004858431069268795}}, "2": {"PALM_prompt": {"bleu": 0.46605282718698626, "bleu_stderr": 0.02996676777558062, "rouge1_fmeasure": 0.11300207247927123, "rouge1_fmeasure_stderr": 0.0019165980987731685, "rouge1_precision": 0.07223534858221925, "rouge1_precision_stderr": 0.0014523280167324517, "rouge1_recall": 0.3709874761998271, "rouge1_recall_stderr": 0.005246128541242479, "rouge2_fmeasure": 0.05107703767051227, "rouge2_fmeasure_stderr": 0.001189241631450374, "rouge2_precision": 0.032619601917078085, "rouge2_precision_stderr": 0.0008732560599076257, "rouge2_recall": 0.17538727298076712, "rouge2_recall_stderr": 0.0036277508794783438, "rougeL_fmeasure": 0.104830081432862, "rougeL_fmeasure_stderr": 0.0017334774718182442, "rougeL_precision": 0.06700314943348495, "rougeL_precision_stderr": 0.0013017430875308894, "rougeL_recall": 0.341280675217576, "rougeL_recall_stderr": 0.004680138815460377, "rougeLsum_fmeasure": 0.10664511244354742, "rougeLsum_fmeasure_stderr": 0.0017935730802929586, "rougeLsum_precision": 0.0682192697317043, "rougeLsum_precision_stderr": 0.0013544348051587117, "rougeLsum_recall": 0.347890761022798, "rougeLsum_recall_stderr": 0.004814945746675548}}, "3": {"PALM_prompt": {"bleu": 0.5506448987587048, "bleu_stderr": 0.028040930117572985, "rouge1_fmeasure": 0.11789825375153909, "rouge1_fmeasure_stderr": 0.001932195619461612, "rouge1_precision": 0.07573922996380036, "rouge1_precision_stderr": 0.0015170397473250185, "rouge1_recall": 0.38492634002858683, "rouge1_recall_stderr": 0.005176207474155607, "rouge2_fmeasure": 0.053756988677712864, "rouge2_fmeasure_stderr": 0.0011965966106312165, "rouge2_precision": 0.03435743387037634, "rouge2_precision_stderr": 0.0008961471238384336, "rouge2_recall": 0.18616259178006056, "rouge2_recall_stderr": 0.0037451350035256482, "rougeL_fmeasure": 0.10879682748659922, "rougeL_fmeasure_stderr": 0.0017295701095651076, "rougeL_precision": 0.06978173508855735, "rougeL_precision_stderr": 0.0013414658300881625, "rougeL_recall": 0.3536412782595861, "rougeL_recall_stderr": 0.004596054949628527, "rougeLsum_fmeasure": 0.11158082524412793, "rougeLsum_fmeasure_stderr": 0.0018211626697717824, "rougeLsum_precision": 0.07172260100588973, "rougeLsum_precision_stderr": 0.0014274208122089145, "rougeLsum_recall": 0.36244343249377947, "rougeLsum_recall_stderr": 0.004772754547214642}}, "4": {"PALM_prompt": {"bleu": 0.5881318297710302, "bleu_stderr": 0.039908239363835654, "rouge1_fmeasure": 0.12053757788822311, "rouge1_fmeasure_stderr": 0.0018961787838997102, "rouge1_precision": 0.07735300568910049, "rouge1_precision_stderr": 0.0015086378866700828, "rouge1_recall": 0.3985419746518129, "rouge1_recall_stderr": 0.005029418888525787, "rouge2_fmeasure": 0.05523496800965618, "rouge2_fmeasure_stderr": 0.0011740287332534603, "rouge2_precision": 0.035199176173288736, "rouge2_precision_stderr": 0.0008620977702919277, "rouge2_recall": 0.19638451832650627, "rouge2_recall_stderr": 0.0037186121018020772, "rougeL_fmeasure": 0.10988623597552064, "rougeL_fmeasure_stderr": 0.0016628282980400671, "rougeL_precision": 0.07030939128233044, "rougeL_precision_stderr": 0.0012828117493288978, "rougeL_recall": 0.36285260508288164, "rougeL_recall_stderr": 0.0044527827738133135, "rougeLsum_fmeasure": 0.11360120534535015, "rougeLsum_fmeasure_stderr": 0.0017628360499995425, "rougeLsum_precision": 0.07278434161088057, "rougeLsum_precision_stderr": 0.0013675084372612996, "rougeLsum_recall": 0.3756859898920006, "rougeLsum_recall_stderr": 0.004687342906896715}}, "5": {"PALM_prompt": {"bleu": 0.6100325324177117, "bleu_stderr": 0.02703478355184964, "rouge1_fmeasure": 0.11969950579850888, "rouge1_fmeasure_stderr": 0.0019228477421699686, "rouge1_precision": 0.07722442196685407, "rouge1_precision_stderr": 0.0016109314348509566, "rouge1_recall": 0.4017267929127603, "rouge1_recall_stderr": 0.005110092029665619, "rouge2_fmeasure": 0.054826378486590326, "rouge2_fmeasure_stderr": 0.0011899938265323704, "rouge2_precision": 0.035279538401147746, "rouge2_precision_stderr": 0.0009640245304949675, "rouge2_recall": 0.19826855938594587, "rouge2_recall_stderr": 0.003837030403723965, "rougeL_fmeasure": 0.1091894265235256, "rougeL_fmeasure_stderr": 0.0017016342146881787, "rougeL_precision": 0.07042983922009842, "rougeL_precision_stderr": 0.0014289861593103398, "rougeL_recall": 0.3644310690944154, "rougeL_recall_stderr": 0.004466552122874037, "rougeLsum_fmeasure": 0.11289648560916707, "rougeLsum_fmeasure_stderr": 0.001777647929316373, "rougeLsum_precision": 0.07279721150667341, "rougeLsum_precision_stderr": 0.0014832472297194098, "rougeLsum_recall": 0.3778701373910634, "rougeLsum_recall_stderr": 0.0046818712318910235}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5110660909639972, "bleu_stderr": 0.07401749441784054, "rouge1_fmeasure": 0.17437063929496616, "rouge1_fmeasure_stderr": 0.0018679203169137393, "rouge1_precision": 0.14841430227531063, "rouge1_precision_stderr": 0.0018849317623447377, "rouge1_recall": 0.2546402796166594, "rouge1_recall_stderr": 0.002706292482883891, "rouge2_fmeasure": 0.03498414446894612, "rouge2_fmeasure_stderr": 0.0008298176462282185, "rouge2_precision": 0.02943881399707129, "rouge2_precision_stderr": 0.0007248235247046832, "rouge2_recall": 0.053608397228254355, "rouge2_recall_stderr": 0.0014462144916590092, "rougeL_fmeasure": 0.13752082994315118, "rougeL_fmeasure_stderr": 0.0013441166414505977, "rougeL_precision": 0.1156713065994801, "rougeL_precision_stderr": 0.0013243962160205257, "rougeL_recall": 0.2058047431206632, "rougeL_recall_stderr": 0.002206261287530996, "rougeLsum_fmeasure": 0.15963577366030096, "rougeLsum_fmeasure_stderr": 0.0017015868824249002, "rougeLsum_precision": 0.13571633871621264, "rougeLsum_precision_stderr": 0.0017186096958220363, "rougeLsum_recall": 0.2340095218448455, "rougeLsum_recall_stderr": 0.002497602615389906}}, "1": {"tldr_en": {"bleu": 2.246682518650239, "bleu_stderr": 0.06775703875806485, "rouge1_fmeasure": 0.20178154664890433, "rouge1_fmeasure_stderr": 0.0019091801402858672, "rouge1_precision": 0.1730537365856929, "rouge1_precision_stderr": 0.0020147050749397054, "rouge1_recall": 0.29341449100333866, "rouge1_recall_stderr": 0.0027275267975047585, "rouge2_fmeasure": 0.044212366766547766, "rouge2_fmeasure_stderr": 0.0009611400325850913, "rouge2_precision": 0.0378044364023744, "rouge2_precision_stderr": 0.0008707368086399108, "rouge2_recall": 0.06598843832993921, "rouge2_recall_stderr": 0.0015965497648305382, "rougeL_fmeasure": 0.14567255875431964, "rougeL_fmeasure_stderr": 0.0012969079642798744, "rougeL_precision": 0.12349613685503863, "rougeL_precision_stderr": 0.0013414079107970274, "rougeL_recall": 0.21768049847327345, "rougeL_recall_stderr": 0.002171417223641686, "rougeLsum_fmeasure": 0.18855005555536836, "rougeLsum_fmeasure_stderr": 0.0017792963187320534, "rougeLsum_precision": 0.16149868456320995, "rougeLsum_precision_stderr": 0.0018740327287461542, "rougeLsum_recall": 0.2751854506345283, "rougeLsum_recall_stderr": 0.0025841619520934733}}, "2": {"tldr_en": {"bleu": 2.5041236014249213, "bleu_stderr": 0.054677815274649495, "rouge1_fmeasure": 0.21020781216935502, "rouge1_fmeasure_stderr": 0.0019360400703538375, "rouge1_precision": 0.1819807478308173, "rouge1_precision_stderr": 0.002091157564590908, "rouge1_recall": 0.3029305112527695, "rouge1_recall_stderr": 0.0027548325089732, "rouge2_fmeasure": 0.05013925437862328, "rouge2_fmeasure_stderr": 0.0009903485147246848, "rouge2_precision": 0.04325856897701525, "rouge2_precision_stderr": 0.000932364650414809, "rouge2_recall": 0.07429867293210415, "rouge2_recall_stderr": 0.0016111887959417725, "rougeL_fmeasure": 0.15515714332881575, "rougeL_fmeasure_stderr": 0.0013346431577317703, "rougeL_precision": 0.13305722535113657, "rougeL_precision_stderr": 0.0014277373936989982, "rougeL_recall": 0.22871882340075878, "rougeL_recall_stderr": 0.0021821233762823362, "rougeLsum_fmeasure": 0.1968617160132362, "rougeLsum_fmeasure_stderr": 0.0018143106376436737, "rougeLsum_precision": 0.17024159081815748, "rougeLsum_precision_stderr": 0.001957592483299715, "rougeLsum_recall": 0.2845967556685176, "rougeLsum_recall_stderr": 0.002621760111154941}}, "3": {"tldr_en": {"bleu": 2.414166858396001, "bleu_stderr": 0.1017204202701052, "rouge1_fmeasure": 0.1772632037488201, "rouge1_fmeasure_stderr": 0.002218928925982093, "rouge1_precision": 0.16185089345243805, "rouge1_precision_stderr": 0.002447244225379841, "rouge1_recall": 0.251517278638771, "rouge1_recall_stderr": 0.003226010774604141, "rouge2_fmeasure": 0.04125017703720899, "rouge2_fmeasure_stderr": 0.0009395476601233144, "rouge2_precision": 0.03709929137355734, "rouge2_precision_stderr": 0.0009517935690523235, "rouge2_recall": 0.061570751938897314, "rouge2_recall_stderr": 0.0015824433764131248, "rougeL_fmeasure": 0.13317965374627014, "rougeL_fmeasure_stderr": 0.0016023423958461526, "rougeL_precision": 0.12128193808204789, "rougeL_precision_stderr": 0.00183443739135039, "rougeL_recall": 0.19343716089196752, "rougeL_recall_stderr": 0.002584189617132911, "rougeLsum_fmeasure": 0.1666081331968882, "rougeLsum_fmeasure_stderr": 0.0020738729000881525, "rougeLsum_precision": 0.15219158593894744, "rougeLsum_precision_stderr": 0.0023055319887651145, "rougeLsum_recall": 0.23707015010427476, "rougeLsum_recall_stderr": 0.003060030836900186}}, "4": {"tldr_en": {"bleu": 0.6242302767315615, "bleu_stderr": 0.04253215138946399, "rouge1_fmeasure": 0.059095668151701794, "rouge1_fmeasure_stderr": 0.0019914573168015462, "rouge1_precision": 0.05589660634771713, "rouge1_precision_stderr": 0.0020789247458225762, "rouge1_recall": 0.0866210676290151, "rouge1_recall_stderr": 0.002970118175189146, "rouge2_fmeasure": 0.01492978001583821, "rouge2_fmeasure_stderr": 0.0007051123182883665, "rouge2_precision": 0.013794438861716635, "rouge2_precision_stderr": 0.0007198996863920006, "rouge2_recall": 0.02323840526022537, "rouge2_recall_stderr": 0.0012081229235487365, "rougeL_fmeasure": 0.04527379830332456, "rougeL_fmeasure_stderr": 0.0014982426681793449, "rougeL_precision": 0.04287508113995099, "rougeL_precision_stderr": 0.00159737758143571, "rougeL_recall": 0.06772654959144096, "rougeL_recall_stderr": 0.002355177689293077, "rougeLsum_fmeasure": 0.055240332031071544, "rougeLsum_fmeasure_stderr": 0.001858674299324512, "rougeLsum_precision": 0.05244988982418966, "rougeLsum_precision_stderr": 0.0019680061386683676, "rougeLsum_recall": 0.0810586452425775, "rougeLsum_recall_stderr": 0.002782405355135347}}, "5": {"tldr_en": {"bleu": 1.615827815577086e-06, "bleu_stderr": 3.2333552495423247e-06, "rouge1_fmeasure": 0.009318253326635167, "rouge1_fmeasure_stderr": 0.000866152474725978, "rouge1_precision": 0.00946125436925863, "rouge1_precision_stderr": 0.0009815109693569397, "rouge1_recall": 0.013468009711675888, "rouge1_recall_stderr": 0.001269746421645073, "rouge2_fmeasure": 0.0022181250558980193, "rouge2_fmeasure_stderr": 0.0002700454897219965, "rouge2_precision": 0.0020886287050252525, "rouge2_precision_stderr": 0.00026488805810467837, "rouge2_recall": 0.0034252359070913153, "rouge2_recall_stderr": 0.0004431407661930414, "rougeL_fmeasure": 0.007216579385654348, "rougeL_fmeasure_stderr": 0.0006514725881035319, "rougeL_precision": 0.007442143655450109, "rougeL_precision_stderr": 0.000775111822355702, "rougeL_recall": 0.010621704812097997, "rougeL_recall_stderr": 0.0010052653546245714, "rougeLsum_fmeasure": 0.008729835862025952, "rougeLsum_fmeasure_stderr": 0.0008108342616344646, "rougeLsum_precision": 0.008854227589439165, "rougeLsum_precision_stderr": 0.0009238074004769046, "rougeLsum_recall": 0.012656577138490685, "rougeLsum_recall_stderr": 0.0011982797701887724}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.0378761453257783, "bleu_stderr": 0.08518203279465711, "rouge1_fmeasure": 0.15284466517030987, "rouge1_fmeasure_stderr": 0.0020733229264210833, "rouge1_precision": 0.12346238513217829, "rouge1_precision_stderr": 0.0018953361220130477, "rouge1_recall": 0.22095983282063245, "rouge1_recall_stderr": 0.0028901633809294567, "rouge2_fmeasure": 0.05603098508787396, "rouge2_fmeasure_stderr": 0.001104580558808195, "rouge2_precision": 0.04442101205623975, "rouge2_precision_stderr": 0.0009174859090253856, "rouge2_recall": 0.08189624776685947, "rouge2_recall_stderr": 0.0016402680486423498, "rougeL_fmeasure": 0.14424875152214156, "rougeL_fmeasure_stderr": 0.0018997489713444592, "rougeL_precision": 0.11579616240640865, "rougeL_precision_stderr": 0.0016878357390951878, "rougeL_recall": 0.21005341644582798, "rougeL_recall_stderr": 0.0027249280433669612, "rougeLsum_fmeasure": 0.13266404353961783, "rougeLsum_fmeasure_stderr": 0.0018444407314532878, "rougeLsum_precision": 0.10723369718745066, "rougeLsum_precision_stderr": 0.0016785659862114447, "rougeLsum_recall": 0.19175382965940935, "rougeLsum_recall_stderr": 0.002581731401648066}}, "1": {"generate_text_restaurant": {"bleu": 5.126169908039335, "bleu_stderr": 0.060019707849600396, "rouge1_fmeasure": 0.30084375583488465, "rouge1_fmeasure_stderr": 0.0017666196438528153, "rouge1_precision": 0.2374397743893787, "rouge1_precision_stderr": 0.001827366166961729, "rouge1_recall": 0.4538895860871574, "rouge1_recall_stderr": 0.002739145525628191, "rouge2_fmeasure": 0.11813763333164416, "rouge2_fmeasure_stderr": 0.001265066374684441, "rouge2_precision": 0.09311186153520964, "rouge2_precision_stderr": 0.0012117859139284565, "rouge2_recall": 0.18237560978508383, "rouge2_recall_stderr": 0.0020325975329361454, "rougeL_fmeasure": 0.24433694694761532, "rougeL_fmeasure_stderr": 0.00130346416067227, "rougeL_precision": 0.19149503022784875, "rougeL_precision_stderr": 0.0013376484927668058, "rougeL_recall": 0.3734418164524156, "rougeL_recall_stderr": 0.0023461065260632426, "rougeLsum_fmeasure": 0.24256192147463654, "rougeLsum_fmeasure_stderr": 0.0016652743992374153, "rougeLsum_precision": 0.19159105345376767, "rougeLsum_precision_stderr": 0.0016482031013163958, "rougeLsum_recall": 0.3662200697989403, "rougeLsum_recall_stderr": 0.0026084151410531906}}, "2": {"generate_text_restaurant": {"bleu": 6.249384385334651, "bleu_stderr": 0.07770725715172515, "rouge1_fmeasure": 0.3236620882311894, "rouge1_fmeasure_stderr": 0.001730869351887234, "rouge1_precision": 0.2571488513560848, "rouge1_precision_stderr": 0.0017479087453492585, "rouge1_recall": 0.4776474806351213, "rouge1_recall_stderr": 0.0027381915088070803, "rouge2_fmeasure": 0.13757318526100176, "rouge2_fmeasure_stderr": 0.0013355463995447833, "rouge2_precision": 0.1085707308611601, "rouge2_precision_stderr": 0.0012335636842772677, "rouge2_recall": 0.20861118989211233, "rouge2_recall_stderr": 0.0021888224556685905, "rougeL_fmeasure": 0.265325935238766, "rougeL_fmeasure_stderr": 0.001349636506785451, "rougeL_precision": 0.21005072572789943, "rougeL_precision_stderr": 0.0013866447459134966, "rougeL_recall": 0.39533407328249814, "rougeL_recall_stderr": 0.0024077424449100897, "rougeLsum_fmeasure": 0.2652383774880513, "rougeLsum_fmeasure_stderr": 0.0016824765629729075, "rougeLsum_precision": 0.21084881314164047, "rougeLsum_precision_stderr": 0.0016535970510521015, "rougeLsum_recall": 0.39227019176650024, "rougeLsum_recall_stderr": 0.0026723625566665876}}, "3": {"generate_text_restaurant": {"bleu": 6.757816195559476, "bleu_stderr": 0.0636105382851002, "rouge1_fmeasure": 0.3311529384971786, "rouge1_fmeasure_stderr": 0.0017193353624414865, "rouge1_precision": 0.2649025981125516, "rouge1_precision_stderr": 0.001816521271723974, "rouge1_recall": 0.485500501669586, "rouge1_recall_stderr": 0.0027035556710176397, "rouge2_fmeasure": 0.14464807696490528, "rouge2_fmeasure_stderr": 0.0013667792152310628, "rouge2_precision": 0.1156613222917535, "rouge2_precision_stderr": 0.00136914737695428, "rouge2_recall": 0.2172590676961178, "rouge2_recall_stderr": 0.0021909769319400805, "rougeL_fmeasure": 0.27382376721341406, "rougeL_fmeasure_stderr": 0.0013534412847209798, "rougeL_precision": 0.21833801769994254, "rougeL_precision_stderr": 0.0014604899732955515, "rougeL_recall": 0.40505182586725, "rougeL_recall_stderr": 0.0023977022968250496, "rougeLsum_fmeasure": 0.27330735067492545, "rougeLsum_fmeasure_stderr": 0.0016914308676584363, "rougeLsum_precision": 0.2188141627190453, "rougeLsum_precision_stderr": 0.0017198980966469132, "rougeLsum_recall": 0.40131515523410427, "rougeLsum_recall_stderr": 0.0026468309652264163}}, "4": {"generate_text_restaurant": {"bleu": 6.8116932588036025, "bleu_stderr": 0.08803986085496286, "rouge1_fmeasure": 0.33109808130241025, "rouge1_fmeasure_stderr": 0.001743234134850443, "rouge1_precision": 0.2692618389240567, "rouge1_precision_stderr": 0.0019668736185400296, "rouge1_recall": 0.4771107096079804, "rouge1_recall_stderr": 0.0026181488455626965, "rouge2_fmeasure": 0.14604424103058358, "rouge2_fmeasure_stderr": 0.0013887223383308772, "rouge2_precision": 0.11873708025959091, "rouge2_precision_stderr": 0.001398837418169464, "rouge2_recall": 0.21498597349348783, "rouge2_recall_stderr": 0.002140162295467893, "rougeL_fmeasure": 0.27557083918753333, "rougeL_fmeasure_stderr": 0.0013751463805022712, "rougeL_precision": 0.22307248979456698, "rougeL_precision_stderr": 0.0015556609206750656, "rougeL_recall": 0.40100397544077326, "rougeL_recall_stderr": 0.002350965909261926, "rougeLsum_fmeasure": 0.27488278035950847, "rougeLsum_fmeasure_stderr": 0.0017432716698613845, "rougeLsum_precision": 0.22375069182479146, "rougeLsum_precision_stderr": 0.0018651442719424599, "rougeLsum_recall": 0.39667186960697276, "rougeLsum_recall_stderr": 0.002625562673886146}}, "5": {"generate_text_restaurant": {"bleu": 7.123004030201217, "bleu_stderr": 0.10755431721392276, "rouge1_fmeasure": 0.34009425919881386, "rouge1_fmeasure_stderr": 0.0018127528383437095, "rouge1_precision": 0.283387194738878, "rouge1_precision_stderr": 0.0021889859805422174, "rouge1_recall": 0.47762279214889136, "rouge1_recall_stderr": 0.002598204572656884, "rouge2_fmeasure": 0.15173507276703452, "rouge2_fmeasure_stderr": 0.0014423284554178045, "rouge2_precision": 0.1268317618878522, "rouge2_precision_stderr": 0.001539845631916355, "rouge2_recall": 0.21704921295074978, "rouge2_recall_stderr": 0.00213381640113868, "rougeL_fmeasure": 0.28167675573246964, "rougeL_fmeasure_stderr": 0.0014174528695001733, "rougeL_precision": 0.23348226143885964, "rougeL_precision_stderr": 0.0017364200503098236, "rougeL_recall": 0.4000256021331328, "rougeL_recall_stderr": 0.0023393501528240785, "rougeLsum_fmeasure": 0.2841776706521971, "rougeLsum_fmeasure_stderr": 0.0017988587175840989, "rougeLsum_precision": 0.23733946015237703, "rougeLsum_precision_stderr": 0.0020776364265581017, "rougeLsum_recall": 0.3992233197709026, "rougeLsum_recall_stderr": 0.002574196241305113}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9362576802818214, "bleu_stderr": 0.07613560025591426, "rouge1_fmeasure": 0.20774854380832464, "rouge1_fmeasure_stderr": 0.002559300503441153, "rouge1_precision": 0.1555958457764852, "rouge1_precision_stderr": 0.0021160845746587115, "rouge1_recall": 0.34158447671046027, "rouge1_recall_stderr": 0.004323062909638541, "rouge2_fmeasure": 0.044139249367473786, "rouge2_fmeasure_stderr": 0.0015460831827267061, "rouge2_precision": 0.032455922926097835, "rouge2_precision_stderr": 0.0011708356708942493, "rouge2_recall": 0.07494285935393781, "rouge2_recall_stderr": 0.0026689396207543992, "rougeL_fmeasure": 0.15374732499060603, "rougeL_fmeasure_stderr": 0.0019079112053866937, "rougeL_precision": 0.11511600394721422, "rougeL_precision_stderr": 0.001594627834760935, "rougeL_recall": 0.25430483568408224, "rougeL_recall_stderr": 0.003345749633090871, "rougeLsum_fmeasure": 0.16253188243527864, "rougeLsum_fmeasure_stderr": 0.002156777336291824, "rougeLsum_precision": 0.1213281516988994, "rougeLsum_precision_stderr": 0.0017356602287848383, "rougeLsum_recall": 0.26933797612929455, "rougeLsum_recall_stderr": 0.003780314309352455}}, "1": {"article_DOC_summary": {"bleu": 1.4112544524316093, "bleu_stderr": 0.08496214042954833, "rouge1_fmeasure": 0.17760255904564273, "rouge1_fmeasure_stderr": 0.0024690797538052664, "rouge1_precision": 0.126369802264852, "rouge1_precision_stderr": 0.0018405489799639353, "rouge1_recall": 0.31122732461789854, "rouge1_recall_stderr": 0.004193666873674977, "rouge2_fmeasure": 0.03540453263680104, "rouge2_fmeasure_stderr": 0.0013817982389489629, "rouge2_precision": 0.024966878604215887, "rouge2_precision_stderr": 0.0009821028802625116, "rouge2_recall": 0.06352834342639134, "rouge2_recall_stderr": 0.002525805879626373, "rougeL_fmeasure": 0.13621833396997876, "rougeL_fmeasure_stderr": 0.0017888518922440696, "rougeL_precision": 0.09670754836321287, "rougeL_precision_stderr": 0.0013239157948012734, "rougeL_recall": 0.2405099629356952, "rougeL_recall_stderr": 0.0031904466101414556, "rougeLsum_fmeasure": 0.1430983873534407, "rougeLsum_fmeasure_stderr": 0.002024072232741044, "rougeLsum_precision": 0.10153767215572433, "rougeLsum_precision_stderr": 0.0014859568414341794, "rougeLsum_recall": 0.2527563825359106, "rougeLsum_recall_stderr": 0.0036111698335435823}}, "2": {"article_DOC_summary": {"bleu": 1.3507302866689068, "bleu_stderr": 0.08510431402948973, "rouge1_fmeasure": 0.17585370691736185, "rouge1_fmeasure_stderr": 0.0023925094764087468, "rouge1_precision": 0.1246367268120949, "rouge1_precision_stderr": 0.001773757664330906, "rouge1_recall": 0.31045490731458164, "rouge1_recall_stderr": 0.004084886636922553, "rouge2_fmeasure": 0.0348723834844472, "rouge2_fmeasure_stderr": 0.0013509469241876884, "rouge2_precision": 0.024475004166713072, "rouge2_precision_stderr": 0.0009533956524831488, "rouge2_recall": 0.06310551153540954, "rouge2_recall_stderr": 0.002485011154241004, "rougeL_fmeasure": 0.13534716420373505, "rougeL_fmeasure_stderr": 0.001731586302015437, "rougeL_precision": 0.09570739798294506, "rougeL_precision_stderr": 0.0012693669937521557, "rougeL_recall": 0.24064083206853418, "rougeL_recall_stderr": 0.003131371167507212, "rougeLsum_fmeasure": 0.14114161105688136, "rougeLsum_fmeasure_stderr": 0.001994499763031248, "rougeLsum_precision": 0.0998302330407736, "rougeLsum_precision_stderr": 0.0014584981746615054, "rougeLsum_recall": 0.250757479147429, "rougeLsum_recall_stderr": 0.0035444274589021135}}, "3": {"article_DOC_summary": {"bleu": 1.2479004603054775, "bleu_stderr": 0.10042150687708706, "rouge1_fmeasure": 0.16561635893672155, "rouge1_fmeasure_stderr": 0.0025006010127745777, "rouge1_precision": 0.1198460419958428, "rouge1_precision_stderr": 0.0019417773960241746, "rouge1_recall": 0.28812863730867544, "rouge1_recall_stderr": 0.00435221479942355, "rouge2_fmeasure": 0.03171913029877269, "rouge2_fmeasure_stderr": 0.0012890667321138701, "rouge2_precision": 0.022378408876613878, "rouge2_precision_stderr": 0.0009076721878497753, "rouge2_recall": 0.05703336298601347, "rouge2_recall_stderr": 0.002389320400830883, "rougeL_fmeasure": 0.12816630446404473, "rougeL_fmeasure_stderr": 0.0018586126201792224, "rougeL_precision": 0.0923882446291546, "rougeL_precision_stderr": 0.001403733929268059, "rougeL_recall": 0.2245336745602999, "rougeL_recall_stderr": 0.003387032194341533, "rougeLsum_fmeasure": 0.1325551184027993, "rougeLsum_fmeasure_stderr": 0.002071515875578705, "rougeLsum_precision": 0.09550627197662054, "rougeLsum_precision_stderr": 0.0015531216409860379, "rougeLsum_recall": 0.23229017215385045, "rougeLsum_recall_stderr": 0.003730217543869211}}, "4": {"article_DOC_summary": {"bleu": 0.47215686015269726, "bleu_stderr": 0.08574887203656281, "rouge1_fmeasure": 0.04413918217227094, "rouge1_fmeasure_stderr": 0.002480156364042011, "rouge1_precision": 0.03751349089373983, "rouge1_precision_stderr": 0.0022800813493760543, "rouge1_recall": 0.0697871819923432, "rouge1_recall_stderr": 0.0040381330725425294, "rouge2_fmeasure": 0.007074958138573533, "rouge2_fmeasure_stderr": 0.0006827311475169871, "rouge2_precision": 0.005207350657842113, "rouge2_precision_stderr": 0.0005060299660598674, "rouge2_recall": 0.012175067651071106, "rouge2_recall_stderr": 0.0012112660495086575, "rougeL_fmeasure": 0.03395950866138848, "rougeL_fmeasure_stderr": 0.0018808375195373738, "rougeL_precision": 0.029249072317561912, "rougeL_precision_stderr": 0.0018195030363634698, "rougeL_recall": 0.053925106681511445, "rougeL_recall_stderr": 0.003092654840712885, "rougeLsum_fmeasure": 0.035243014423520146, "rougeLsum_fmeasure_stderr": 0.0019800538937599426, "rougeLsum_precision": 0.03021312469117469, "rougeLsum_precision_stderr": 0.0018780320378658164, "rougeLsum_recall": 0.05599613612313265, "rougeLsum_recall_stderr": 0.0032612293404024133}}, "5": {"article_DOC_summary": {"bleu": 1.2727719743841858e-37, "bleu_stderr": 1.2076447049036407e-32, "rouge1_fmeasure": 0.002321071661055069, "rouge1_fmeasure_stderr": 0.0006408973331857279, "rouge1_precision": 0.0026069444960735337, "rouge1_precision_stderr": 0.0007388856823274387, "rouge1_recall": 0.0021572935558770803, "rouge1_recall_stderr": 0.0005864279242216826, "rouge2_fmeasure": 0.00012314549854098596, "rouge2_fmeasure_stderr": 7.122462679033626e-05, "rouge2_precision": 0.00014854426619132502, "rouge2_precision_stderr": 8.571900312776038e-05, "rouge2_recall": 0.00010649727630859706, "rouge2_recall_stderr": 6.211817611366337e-05, "rougeL_fmeasure": 0.0016508273325058706, "rougeL_fmeasure_stderr": 0.00043880923309574715, "rougeL_precision": 0.0018449802658868611, "rougeL_precision_stderr": 0.0005037469801884364, "rougeL_recall": 0.0015501254308975295, "rougeL_recall_stderr": 0.0004091731091328618, "rougeLsum_fmeasure": 0.0016438057763284056, "rougeLsum_fmeasure_stderr": 0.00044670243892416974, "rougeLsum_precision": 0.0018421775438832511, "rougeLsum_precision_stderr": 0.0005163110280087291, "rougeLsum_recall": 0.0015410499501239353, "rougeLsum_recall_stderr": 0.0004141584827452071}}}} \ No newline at end of file diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0.csv b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..c14cb49b54871fb8c1a97cffb5f0e118da9ff8cf --- /dev/null +++ b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.01483050720454104,0 +anli_r2,acc,0.332,0.014899597242811475,0 +anli_r3,acc,0.3308333333333333,0.013588208070709002,0 +arc_challenge,acc,0.23720136518771331,0.012430399829260842,0 +arc_challenge,acc_norm,0.2721843003412969,0.013006600406423704,0 +arc_easy,acc,0.5753367003367004,0.01014265368748041,0 +arc_easy,acc_norm,0.49873737373737376,0.010259750807991153,0 +boolq,acc,0.5559633027522936,0.008690105214920793,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.30718954248366015,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4340768771161123,0.0049462215121452765,0 +hellaswag,acc_norm,0.5581557458673571,0.004955914693717967,0 +piqa,acc,0.7328618063112078,0.01032344049261244,0 +piqa,acc_norm,0.735582154515778,0.010289787244767158,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.813,0.01233625482807413,0 +sciq,acc_norm,0.724,0.014142984975740666,0 +storycloze_2016,acc,0.6873329770176376,0.010720223172953174,0 +winogrande,acc,0.5627466456195738,0.013941393310695924,0 diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index 0aa252ec04c6cbe869a0cfbe87b8f70897df21e7..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.01483050720454104 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811475 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070709002 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.30718954248366015 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4340768771161123, - "acc_stderr": 0.0049462215121452765, - "acc_norm": 0.5581557458673571, - "acc_norm_stderr": 0.004955914693717967 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5627466456195738, - "acc_stderr": 0.013941393310695924 - }, - "storycloze_2016": { - "acc": 0.6873329770176376, - "acc_stderr": 0.010720223172953174 - }, - "boolq": { - "acc": 0.5559633027522936, - "acc_stderr": 0.008690105214920793 - }, - "arc_easy": { - "acc": 0.5753367003367004, - "acc_stderr": 0.01014265368748041, - "acc_norm": 0.49873737373737376, - "acc_norm_stderr": 0.010259750807991153 - }, - "arc_challenge": { - "acc": 0.23720136518771331, - "acc_stderr": 0.012430399829260842, - "acc_norm": 0.2721843003412969, - "acc_norm_stderr": 0.013006600406423704 - }, - "sciq": { - "acc": 0.813, - "acc_stderr": 0.01233625482807413, - "acc_norm": 0.724, - "acc_norm_stderr": 0.014142984975740666 - }, - "piqa": { - "acc": 0.7328618063112078, - "acc_stderr": 0.01032344049261244, - "acc_norm": 0.735582154515778, - "acc_norm_stderr": 0.010289787244767158 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1.csv b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..9927f485fcc21f32991d544a23c8a760f15de6b4 --- /dev/null +++ b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.356,0.015149042659306628,0 +anli_r2,acc,0.332,0.014899597242811483,0 +anli_r3,acc,0.34833333333333333,0.01375943749887408,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.2935153583617747,0.013307250444941127,0 +arc_easy,acc,0.5728114478114478,0.010150415974210868,0 +arc_easy,acc_norm,0.5256734006734006,0.010246249665591215,0 +boolq,acc,0.5758409785932722,0.00864386902338812,1 +cb,acc,0.5892857142857143,0.0663363415035954,1 +cb,f1,0.4111718275652702,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4314877514439355,0.004942716091996078,0 +hellaswag,acc_norm,0.5596494722166899,0.004954146286513344,0 +piqa,acc,0.7323177366702938,0.01033011118937043,0 +piqa,acc_norm,0.7334058759521219,0.010316749863541365,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.842,0.011539894677559562,0 +sciq,acc_norm,0.812,0.01236158601510375,0 +storycloze_2016,acc,0.6835916622127205,0.010754780097940887,0 +winogrande,acc,0.56353591160221,0.013938569465677023,0 diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json deleted file mode 100644 index 41b4970e256ddc29a3d67ccaa7b40d0edd852412..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.356, - "acc_stderr": 0.015149042659306628 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811483 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.01375943749887408 - }, - "cb": { - "acc": 0.5892857142857143, - "acc_stderr": 0.0663363415035954, - "f1": 0.4111718275652702 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4314877514439355, - "acc_stderr": 0.004942716091996078, - "acc_norm": 0.5596494722166899, - "acc_norm_stderr": 0.004954146286513344 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.56353591160221, - "acc_stderr": 0.013938569465677023 - }, - "storycloze_2016": { - "acc": 0.6835916622127205, - "acc_stderr": 0.010754780097940887 - }, - "boolq": { - "acc": 0.5758409785932722, - "acc_stderr": 0.00864386902338812 - }, - "arc_easy": { - "acc": 0.5728114478114478, - "acc_stderr": 0.010150415974210868, - "acc_norm": 0.5256734006734006, - "acc_norm_stderr": 0.010246249665591215 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.2935153583617747, - "acc_norm_stderr": 0.013307250444941127 - }, - "sciq": { - "acc": 0.842, - "acc_stderr": 0.011539894677559562, - "acc_norm": 0.812, - "acc_norm_stderr": 0.01236158601510375 - }, - "piqa": { - "acc": 0.7323177366702938, - "acc_stderr": 0.01033011118937043, - "acc_norm": 0.7334058759521219, - "acc_norm_stderr": 0.010316749863541365 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2.csv b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..e4593e28385b0abf80135f0ba667856486a1a1d1 --- /dev/null +++ b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.343,0.015019206922356953,0 +anli_r2,acc,0.335,0.014933117490932573,0 +anli_r3,acc,0.34833333333333333,0.013759437498874066,0 +arc_challenge,acc,0.25853242320819114,0.012794553754288686,0 +arc_challenge,acc_norm,0.2738907849829352,0.013032004972989503,0 +arc_easy,acc,0.5841750841750841,0.01011334824464787,0 +arc_easy,acc_norm,0.5555555555555556,0.010196254838691682,0 +boolq,acc,0.5767584097859327,0.008641391399113586,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.2908054169636493,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.4312885879306911,0.004942440746328496,0 +hellaswag,acc_norm,0.5573590918143796,0.004956839256162738,0 +piqa,acc,0.735038084874864,0.010296557993316056,0 +piqa,acc_norm,0.7388465723612623,0.010248738649935592,0 +rte,acc,0.5523465703971119,0.029931070362939526,0 +sciq,acc,0.862,0.010912152632504401,0 +sciq,acc_norm,0.838,0.011657267771304413,0 +storycloze_2016,acc,0.6878674505611972,0.010715220346279683,0 +winogrande,acc,0.5698500394632992,0.013914685094716696,0 diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index 24dcd794d36671b4a7fb09a204e3b1c93754c09b..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, - "acc_stderr": 0.015019206922356953 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874066 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.2908054169636493 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.4312885879306911, - "acc_stderr": 0.004942440746328496, - "acc_norm": 0.5573590918143796, - "acc_norm_stderr": 0.004956839256162738 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5698500394632992, - "acc_stderr": 0.013914685094716696 - }, - "storycloze_2016": { - "acc": 0.6878674505611972, - "acc_stderr": 0.010715220346279683 - }, - "boolq": { - "acc": 0.5767584097859327, - "acc_stderr": 0.008641391399113586 - }, - "arc_easy": { - "acc": 0.5841750841750841, - "acc_stderr": 0.01011334824464787, - "acc_norm": 0.5555555555555556, - "acc_norm_stderr": 0.010196254838691682 - }, - "arc_challenge": { - "acc": 0.25853242320819114, - "acc_stderr": 0.012794553754288686, - "acc_norm": 0.2738907849829352, - "acc_norm_stderr": 0.013032004972989503 - }, - "sciq": { - "acc": 0.862, - "acc_stderr": 0.010912152632504401, - "acc_norm": 0.838, - "acc_norm_stderr": 0.011657267771304413 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316056, - "acc_norm": 0.7388465723612623, - "acc_norm_stderr": 0.010248738649935592 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3.csv b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..eba8083a57919422a997c8957fa76752999084bd --- /dev/null +++ b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795021,0 +anli_r2,acc,0.351,0.015100563798316403,0 +anli_r3,acc,0.35333333333333333,0.013804572162314928,0 +arc_challenge,acc,0.25853242320819114,0.012794553754288687,0 +arc_challenge,acc_norm,0.2645051194539249,0.012889272949313368,0 +arc_easy,acc,0.5837542087542088,0.010114819404500873,0 +arc_easy,acc_norm,0.5686026936026936,0.010162752847747506,0 +boolq,acc,0.5694189602446483,0.00866036014598874,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.35846267553584626,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4303923521210914,0.004941191607317909,0 +hellaswag,acc_norm,0.5641306512646883,0.004948567856373861,0 +piqa,acc,0.735038084874864,0.010296557993316056,0 +piqa,acc_norm,0.7421109902067464,0.010206956662056245,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.862,0.010912152632504397,0 +sciq,acc_norm,0.845,0.011450157470799456,0 +storycloze_2016,acc,0.6857295563869589,0.01073513228510818,0 +winogrande,acc,0.5714285714285714,0.013908353814606693,0 diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json deleted file mode 100644 index d695f7a1f42d798408ca4fbfba30d6fd4ac11c31..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795021 - }, - "anli_r2": { - "acc": 0.351, - "acc_stderr": 0.015100563798316403 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314928 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.35846267553584626 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4303923521210914, - "acc_stderr": 0.004941191607317909, - "acc_norm": 0.5641306512646883, - "acc_norm_stderr": 0.004948567856373861 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5714285714285714, - "acc_stderr": 0.013908353814606693 - }, - "storycloze_2016": { - "acc": 0.6857295563869589, - "acc_stderr": 0.01073513228510818 - }, - "boolq": { - "acc": 0.5694189602446483, - "acc_stderr": 0.00866036014598874 - }, - "arc_easy": { - "acc": 0.5837542087542088, - "acc_stderr": 0.010114819404500873, - "acc_norm": 0.5686026936026936, - "acc_norm_stderr": 0.010162752847747506 - }, - "arc_challenge": { - "acc": 0.25853242320819114, - "acc_stderr": 0.012794553754288687, - "acc_norm": 0.2645051194539249, - "acc_norm_stderr": 0.012889272949313368 - }, - "sciq": { - "acc": 0.862, - "acc_stderr": 0.010912152632504397, - "acc_norm": 0.845, - "acc_norm_stderr": 0.011450157470799456 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316056, - "acc_norm": 0.7421109902067464, - "acc_norm_stderr": 0.010206956662056245 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4.csv b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..f354039aba75b3f3f78db53780587ac7d011a12a --- /dev/null +++ b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.367,0.015249378464171749,0 +anli_r2,acc,0.355,0.015139491543780532,0 +anli_r3,acc,0.35583333333333333,0.01382651874849331,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207708,0 +arc_challenge,acc_norm,0.2832764505119454,0.013167478735134576,0 +arc_easy,acc,0.5917508417508418,0.010085566195791245,0 +arc_easy,acc_norm,0.5669191919191919,0.010167478013701789,0 +boolq,acc,0.5724770642201835,0.008652692997177337,1 +cb,acc,0.5178571428571429,0.06737697508644647,1 +cb,f1,0.3175,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.4297948615813583,0.004940349676769324,0 +hellaswag,acc_norm,0.5615415255925115,0.0049518409782196935,0 +piqa,acc,0.7295973884657236,0.010363167031620798,0 +piqa,acc_norm,0.733949945593036,0.010310039263352826,0 +rte,acc,0.5487364620938628,0.029953149241808946,0 +sciq,acc,0.874,0.010499249222408047,0 +sciq,acc_norm,0.853,0.011203415395160328,0 +storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0 +winogrande,acc,0.5666929755327546,0.013926915052757345,0 diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index b09c48b4f02775ec3ca23d7dbfeeac38a001c188..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.367, - "acc_stderr": 0.015249378464171749 - }, - "anli_r2": { - "acc": 0.355, - "acc_stderr": 0.015139491543780532 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.01382651874849331 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644647, - "f1": 0.3175 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.4297948615813583, - "acc_stderr": 0.004940349676769324, - "acc_norm": 0.5615415255925115, - "acc_norm_stderr": 0.0049518409782196935 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808946 - }, - "winogrande": { - "acc": 0.5666929755327546, - "acc_stderr": 0.013926915052757345 - }, - "storycloze_2016": { - "acc": 0.6932121859967931, - "acc_stderr": 0.010664275190473634 - }, - "boolq": { - "acc": 0.5724770642201835, - "acc_stderr": 0.008652692997177337 - }, - "arc_easy": { - "acc": 0.5917508417508418, - "acc_stderr": 0.010085566195791245, - "acc_norm": 0.5669191919191919, - "acc_norm_stderr": 0.010167478013701789 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207708, - "acc_norm": 0.2832764505119454, - "acc_norm_stderr": 0.013167478735134576 - }, - "sciq": { - "acc": 0.874, - "acc_stderr": 0.010499249222408047, - "acc_norm": 0.853, - "acc_norm_stderr": 0.011203415395160328 - }, - "piqa": { - "acc": 0.7295973884657236, - "acc_stderr": 0.010363167031620798, - "acc_norm": 0.733949945593036, - "acc_norm_stderr": 0.010310039263352826 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5.csv b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..41abea9f525e568b9cd75427d6482559a1267d49 --- /dev/null +++ b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.385,0.015395194445410808,0 +anli_r2,acc,0.336,0.014944140233795021,0 +anli_r3,acc,0.36333333333333334,0.013889898953170563,0 +arc_challenge,acc,0.257679180887372,0.012780770562768402,0 +arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0 +arc_easy,acc,0.5904882154882155,0.010090368160990059,0 +arc_easy,acc_norm,0.5736531986531986,0.01014785860383514,0 +boolq,acc,0.5666666666666667,0.008666972565214514,1 +cb,acc,0.5535714285714286,0.06703189227942394,1 +cb,f1,0.3077154912597951,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4266082453694483,0.0049357353003488666,0 +hellaswag,acc_norm,0.566620195180243,0.004945291270072436,0 +piqa,acc,0.7285092491838956,0.010376251176596135,0 +piqa,acc_norm,0.7486398258977149,0.01012115601681925,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.872,0.010570133761108665,0 +sciq,acc_norm,0.854,0.0111717862854965,0 +storycloze_2016,acc,0.6841261357562801,0.010749892827011113,0 +winogrande,acc,0.5445935280189423,0.013996485037729782,0 diff --git a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json b/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json deleted file mode 100644 index 5a6a6a6224b82bb36323d0d1e3bd743d9f81acbd..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed2/evaluation/rankeval/2b855b11bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.385, - "acc_stderr": 0.015395194445410808 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795021 - }, - "anli_r3": { - "acc": 0.36333333333333334, - "acc_stderr": 0.013889898953170563 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942394, - "f1": 0.3077154912597951 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4266082453694483, - "acc_stderr": 0.0049357353003488666, - "acc_norm": 0.566620195180243, - "acc_norm_stderr": 0.004945291270072436 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5445935280189423, - "acc_stderr": 0.013996485037729782 - }, - "storycloze_2016": { - "acc": 0.6841261357562801, - "acc_stderr": 0.010749892827011113 - }, - "boolq": { - "acc": 0.5666666666666667, - "acc_stderr": 0.008666972565214514 - }, - "arc_easy": { - "acc": 0.5904882154882155, - "acc_stderr": 0.010090368160990059, - "acc_norm": 0.5736531986531986, - "acc_norm_stderr": 0.01014785860383514 - }, - "arc_challenge": { - "acc": 0.257679180887372, - "acc_stderr": 0.012780770562768402, - "acc_norm": 0.27559726962457337, - "acc_norm_stderr": 0.013057169655761838 - }, - "sciq": { - "acc": 0.872, - "acc_stderr": 0.010570133761108665, - "acc_norm": 0.854, - "acc_norm_stderr": 0.0111717862854965 - }, - "piqa": { - "acc": 0.7285092491838956, - "acc_stderr": 0.010376251176596135, - "acc_norm": 0.7486398258977149, - "acc_norm_stderr": 0.01012115601681925 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/generation/merged.csv b/2b855b11bc4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..21946e87ef4d901aa61c341bf6835b595d0193fc --- /dev/null +++ b/2b855b11bc4seed3/evaluation/generation/merged.csv @@ -0,0 +1,39 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.033687666872503644 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.033687666872503644 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.16855819384473125 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.16855819384473125 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.18538437537983646 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18538437537983646 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19219003937378554 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19219003937378554 +e2e_nlg_cleaned,3,average,multiple,0.14495506886771423 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04347043245633625 +gem_xsum,0,median,rouge2_fmeasure,0.04347043245633625 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.030277850873655133 +gem_xsum,1,median,rouge2_fmeasure,0.030277850873655133 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03015426920788573 +gem_xsum,2,median,rouge2_fmeasure,0.03015426920788573 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.028265095806897757 +gem_xsum,3,median,rouge2_fmeasure,0.028265095806897757 +gem_xsum,3,average,multiple,0.03304191208619372 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.048503194247737774 +web_nlg_en,0,median,rouge2_fmeasure,0.048503194247737774 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.04633905642415022 +web_nlg_en,1,median,rouge2_fmeasure,0.04633905642415022 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.0482705113559789 +web_nlg_en,2,median,rouge2_fmeasure,0.0482705113559789 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0486263549461623 +web_nlg_en,3,median,rouge2_fmeasure,0.0486263549461623 +web_nlg_en,3,average,multiple,0.047934779243507294 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03524633277968111 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03524633277968111 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04022404252550308 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04022404252550308 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04709721853020564 +wiki_lingua_en,2,median,rouge2_fmeasure,0.04709721853020564 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.039320031366549095 +wiki_lingua_en,3,median,rouge2_fmeasure,0.039320031366549095 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01267310048414024 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01267310048414024 +wiki_lingua_en,4,average,multiple,0.034912145137215835 diff --git a/2b855b11bc4seed3/evaluation/generation/merged.json b/2b855b11bc4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..bec24199a031a4a4cdc9e46758f14f64b7cb6269 --- /dev/null +++ b/2b855b11bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3003906758006876, "bleu_stderr": 0.027331800335698254, "rouge1_fmeasure": 0.10522756302657624, "rouge1_fmeasure_stderr": 0.0020564917165852665, "rouge1_precision": 0.06963270011987571, "rouge1_precision_stderr": 0.0016168408610962013, "rouge1_recall": 0.29729464197458877, "rouge1_recall_stderr": 0.0047714353450774415, "rouge2_fmeasure": 0.048503194247737774, "rouge2_fmeasure_stderr": 0.001258248656235562, "rouge2_precision": 0.03200346886185345, "rouge2_precision_stderr": 0.000957625394662931, "rouge2_recall": 0.140470675875411, "rouge2_recall_stderr": 0.0032183793551318674, "rougeL_fmeasure": 0.10106959021191211, "rougeL_fmeasure_stderr": 0.0018911974850607549, "rougeL_precision": 0.06646767562482549, "rougeL_precision_stderr": 0.0014351668395974513, "rougeL_recall": 0.28816568568111717, "rougeL_recall_stderr": 0.004622479804486804, "rougeLsum_fmeasure": 0.10010042995723817, "rougeLsum_fmeasure_stderr": 0.001929318666410727, "rougeLsum_precision": 0.06620461518234684, "rougeLsum_precision_stderr": 0.00151460835911578, "rougeLsum_recall": 0.282597952018841, "rougeLsum_recall_stderr": 0.004481787483630335}}, "1": {"PALM_prompt": {"bleu": 0.32423805540348105, "bleu_stderr": 0.024690542644449438, "rouge1_fmeasure": 0.10249916346044939, "rouge1_fmeasure_stderr": 0.0018656887279984073, "rouge1_precision": 0.06685340388470082, "rouge1_precision_stderr": 0.0015253847039623312, "rouge1_recall": 0.323514427623318, "rouge1_recall_stderr": 0.004639043635695871, "rouge2_fmeasure": 0.04633905642415022, "rouge2_fmeasure_stderr": 0.0011829999453798483, "rouge2_precision": 0.030000286950072674, "rouge2_precision_stderr": 0.0008643544653956381, "rouge2_recall": 0.14824220519105014, "rouge2_recall_stderr": 0.0031972015283258494, "rougeL_fmeasure": 0.09712356208435262, "rougeL_fmeasure_stderr": 0.0017347095170401298, "rougeL_precision": 0.06336375493461167, "rougeL_precision_stderr": 0.0014215770633909588, "rougeL_recall": 0.303094617033746, "rougeL_recall_stderr": 0.004191983337899737, "rougeLsum_fmeasure": 0.09830433934297841, "rougeLsum_fmeasure_stderr": 0.0017883137206561263, "rougeLsum_precision": 0.06416576148923457, "rougeLsum_precision_stderr": 0.0014659099244540854, "rougeLsum_recall": 0.30832609036704645, "rougeLsum_recall_stderr": 0.004339774784201872}}, "2": {"PALM_prompt": {"bleu": 0.33841996991287543, "bleu_stderr": 0.013366615983706327, "rouge1_fmeasure": 0.10597979981258299, "rouge1_fmeasure_stderr": 0.0017714336298557038, "rouge1_precision": 0.06765631897767002, "rouge1_precision_stderr": 0.0013046336481251101, "rouge1_recall": 0.3402361179206784, "rouge1_recall_stderr": 0.004704696452770623, "rouge2_fmeasure": 0.0482705113559789, "rouge2_fmeasure_stderr": 0.001132968575984557, "rouge2_precision": 0.03077023716855104, "rouge2_precision_stderr": 0.0008032546127820459, "rouge2_recall": 0.1606656566085052, "rouge2_recall_stderr": 0.0033693273708689585, "rougeL_fmeasure": 0.10046626150365084, "rougeL_fmeasure_stderr": 0.0016642034527337758, "rougeL_precision": 0.06412349572198735, "rougeL_precision_stderr": 0.0012122011908497355, "rougeL_recall": 0.31907776335571136, "rougeL_recall_stderr": 0.004279931218687767, "rougeLsum_fmeasure": 0.10141858866959458, "rougeLsum_fmeasure_stderr": 0.001700099907526394, "rougeLsum_precision": 0.06475854902642648, "rougeLsum_precision_stderr": 0.0012483946453731626, "rougeLsum_recall": 0.3241391405150877, "rougeLsum_recall_stderr": 0.004418718258324822}}, "3": {"PALM_prompt": {"bleu": 0.40584434052309054, "bleu_stderr": 0.02616462426887616, "rouge1_fmeasure": 0.10602175173953958, "rouge1_fmeasure_stderr": 0.001741987498486231, "rouge1_precision": 0.06757351073723121, "rouge1_precision_stderr": 0.0012679743548778455, "rouge1_recall": 0.3440037162848658, "rouge1_recall_stderr": 0.0049119409175133995, "rouge2_fmeasure": 0.0486263549461623, "rouge2_fmeasure_stderr": 0.0010900986383698252, "rouge2_precision": 0.030886249533870456, "rouge2_precision_stderr": 0.0007629220092981514, "rouge2_recall": 0.16556453686704345, "rouge2_recall_stderr": 0.0034492328057988348, "rougeL_fmeasure": 0.10011209012325656, "rougeL_fmeasure_stderr": 0.0016356941636110295, "rougeL_precision": 0.06384738095162408, "rougeL_precision_stderr": 0.0011862226633864972, "rougeL_recall": 0.3209539378169773, "rougeL_recall_stderr": 0.004410637240630364, "rougeLsum_fmeasure": 0.10161889625106496, "rougeLsum_fmeasure_stderr": 0.0016751216394668587, "rougeLsum_precision": 0.06483859720845632, "rougeLsum_precision_stderr": 0.0012202108024313588, "rougeLsum_recall": 0.3270171129897123, "rougeLsum_recall_stderr": 0.0045709512856975595}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4622087249250515, "bleu_stderr": 0.04207910392007385, "rouge1_fmeasure": 0.1767157419817045, "rouge1_fmeasure_stderr": 0.0018123734169975135, "rouge1_precision": 0.1506827740817673, "rouge1_precision_stderr": 0.001852883253242218, "rouge1_recall": 0.25686230777525454, "rouge1_recall_stderr": 0.00260278766504943, "rouge2_fmeasure": 0.03524633277968111, "rouge2_fmeasure_stderr": 0.0008101513042416649, "rouge2_precision": 0.02971277123757481, "rouge2_precision_stderr": 0.000707355818382236, "rouge2_recall": 0.05307915600648516, "rouge2_recall_stderr": 0.0013686059525554353, "rougeL_fmeasure": 0.13759253266149163, "rougeL_fmeasure_stderr": 0.001281432881200856, "rougeL_precision": 0.11575873439522533, "rougeL_precision_stderr": 0.0012737353859565011, "rougeL_recall": 0.2052922273561128, "rougeL_recall_stderr": 0.0021096414533161158, "rougeLsum_fmeasure": 0.16173441706919453, "rougeLsum_fmeasure_stderr": 0.0016450542485275806, "rougeLsum_precision": 0.1374963134041478, "rougeLsum_precision_stderr": 0.0016657330727227028, "rougeLsum_recall": 0.2362819209208244, "rougeLsum_recall_stderr": 0.002418312889630038}}, "1": {"tldr_en": {"bleu": 1.972919037460516, "bleu_stderr": 0.05682843274487135, "rouge1_fmeasure": 0.19185955437187688, "rouge1_fmeasure_stderr": 0.001834411329990832, "rouge1_precision": 0.1655651982573561, "rouge1_precision_stderr": 0.001967434204420624, "rouge1_recall": 0.2778367078547306, "rouge1_recall_stderr": 0.0026376055817718498, "rouge2_fmeasure": 0.04022404252550308, "rouge2_fmeasure_stderr": 0.0009067575883966806, "rouge2_precision": 0.03468172497365085, "rouge2_precision_stderr": 0.0008414429849505186, "rouge2_recall": 0.06032773756466252, "rouge2_recall_stderr": 0.0014755233968311602, "rougeL_fmeasure": 0.1394283655875047, "rougeL_fmeasure_stderr": 0.0012494508344161473, "rougeL_precision": 0.1190010376024555, "rougeL_precision_stderr": 0.0013201442960543233, "rougeL_recall": 0.20730876012944102, "rougeL_recall_stderr": 0.0020918631972439913, "rougeLsum_fmeasure": 0.1798108549879984, "rougeLsum_fmeasure_stderr": 0.00170035102934959, "rougeLsum_precision": 0.15494002660111156, "rougeLsum_precision_stderr": 0.001825015399111526, "rougeLsum_recall": 0.26137076597079567, "rougeLsum_recall_stderr": 0.002487191761867969}}, "2": {"tldr_en": {"bleu": 2.2743708557193902, "bleu_stderr": 0.06189529268097383, "rouge1_fmeasure": 0.20283722636774154, "rouge1_fmeasure_stderr": 0.001838028419312966, "rouge1_precision": 0.17638523863069297, "rouge1_precision_stderr": 0.002020755519678356, "rouge1_recall": 0.2911840829982234, "rouge1_recall_stderr": 0.0026772675380155376, "rouge2_fmeasure": 0.04709721853020564, "rouge2_fmeasure_stderr": 0.000949364569447851, "rouge2_precision": 0.04064975395757813, "rouge2_precision_stderr": 0.0008728139678303188, "rouge2_recall": 0.0700705095693168, "rouge2_recall_stderr": 0.0016162550664557144, "rougeL_fmeasure": 0.14915790559695724, "rougeL_fmeasure_stderr": 0.001266186326972188, "rougeL_precision": 0.128540063496423, "rougeL_precision_stderr": 0.00138883173221342, "rougeL_recall": 0.21916109079967522, "rougeL_recall_stderr": 0.002145845723677153, "rougeLsum_fmeasure": 0.1899159372074719, "rougeLsum_fmeasure_stderr": 0.0017127052022618613, "rougeLsum_precision": 0.16486271659279425, "rougeLsum_precision_stderr": 0.0018790326981489205, "rougeLsum_recall": 0.2736737223320281, "rougeLsum_recall_stderr": 0.0025545391606520626}}, "3": {"tldr_en": {"bleu": 2.261325582734008, "bleu_stderr": 0.07440671017406066, "rouge1_fmeasure": 0.17007228676988045, "rouge1_fmeasure_stderr": 0.0021295726072665254, "rouge1_precision": 0.15457296707448556, "rouge1_precision_stderr": 0.0023843459800257866, "rouge1_recall": 0.24244601688859282, "rouge1_recall_stderr": 0.0031047631304705336, "rouge2_fmeasure": 0.039320031366549095, "rouge2_fmeasure_stderr": 0.0009156298192238671, "rouge2_precision": 0.03563497026047448, "rouge2_precision_stderr": 0.0009697902407615488, "rouge2_recall": 0.05775187966143085, "rouge2_recall_stderr": 0.0014973785566537818, "rougeL_fmeasure": 0.12582015691412737, "rougeL_fmeasure_stderr": 0.0015236321853914047, "rougeL_precision": 0.11385012472654804, "rougeL_precision_stderr": 0.0017484820235318678, "rougeL_recall": 0.18331103625638695, "rougeL_recall_stderr": 0.0024579268481887276, "rougeLsum_fmeasure": 0.15899932831508662, "rougeLsum_fmeasure_stderr": 0.0019839606960568193, "rougeLsum_precision": 0.1443112386465537, "rougeLsum_precision_stderr": 0.002222016297651524, "rougeLsum_recall": 0.22741868584839184, "rougeLsum_recall_stderr": 0.0029377295753792703}}, "4": {"tldr_en": {"bleu": 0.5600187004785872, "bleu_stderr": 0.034329899401054034, "rouge1_fmeasure": 0.054359788577698374, "rouge1_fmeasure_stderr": 0.001834775379938349, "rouge1_precision": 0.05049412311337655, "rouge1_precision_stderr": 0.0018775705845440543, "rouge1_recall": 0.07991007309327065, "rouge1_recall_stderr": 0.002748908637336395, "rouge2_fmeasure": 0.01267310048414024, "rouge2_fmeasure_stderr": 0.0006260839320397936, "rouge2_precision": 0.01142264448143536, "rouge2_precision_stderr": 0.0006812961216581213, "rouge2_recall": 0.02008142422629143, "rouge2_recall_stderr": 0.0010993317750224779, "rougeL_fmeasure": 0.04187829090557222, "rougeL_fmeasure_stderr": 0.001389559278464916, "rougeL_precision": 0.038958076750695146, "rougeL_precision_stderr": 0.0014576150060918941, "rougeL_recall": 0.0629580148095348, "rougeL_recall_stderr": 0.002201566030491766, "rougeLsum_fmeasure": 0.05081727487259366, "rougeLsum_fmeasure_stderr": 0.0017142240431756424, "rougeLsum_precision": 0.047269915920336, "rougeLsum_precision_stderr": 0.0017705102362424203, "rougeLsum_recall": 0.07497632668531767, "rougeLsum_recall_stderr": 0.002589565865103579}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.2918074829245511, "bleu_stderr": 0.03334721114977758, "rouge1_fmeasure": 0.1600243176507681, "rouge1_fmeasure_stderr": 0.0010733278398958096, "rouge1_precision": 0.12561487707508034, "rouge1_precision_stderr": 0.0013825187846368096, "rouge1_recall": 0.25305668157788624, "rouge1_recall_stderr": 0.0013315623225900873, "rouge2_fmeasure": 0.033687666872503644, "rouge2_fmeasure_stderr": 0.0005133425312314838, "rouge2_precision": 0.026668315991920277, "rouge2_precision_stderr": 0.0005306038751357611, "rouge2_recall": 0.05412553888370808, "rouge2_recall_stderr": 0.0007946156915472112, "rougeL_fmeasure": 0.15622553271845338, "rougeL_fmeasure_stderr": 0.0010043221686026895, "rougeL_precision": 0.12177712671014992, "rougeL_precision_stderr": 0.001240068402500465, "rougeL_recall": 0.24840753894632903, "rougeL_recall_stderr": 0.001318527645790754, "rougeLsum_fmeasure": 0.13789602591344347, "rougeLsum_fmeasure_stderr": 0.0009870480569216915, "rougeLsum_precision": 0.10841189553256889, "rougeLsum_precision_stderr": 0.0012618615939597366, "rougeLsum_recall": 0.21839285442742476, "rougeLsum_recall_stderr": 0.0012454131726385412}}, "1": {"generate_text_restaurant": {"bleu": 9.67590369741921, "bleu_stderr": 0.09901656476390079, "rouge1_fmeasure": 0.39957774138885144, "rouge1_fmeasure_stderr": 0.0022278577374241887, "rouge1_precision": 0.45349765710233814, "rouge1_precision_stderr": 0.0029408652429081406, "rouge1_recall": 0.39709628674803366, "rouge1_recall_stderr": 0.002799520545788188, "rouge2_fmeasure": 0.16855819384473125, "rouge2_fmeasure_stderr": 0.001773629505406216, "rouge2_precision": 0.19448143773761467, "rouge2_precision_stderr": 0.002225394284814737, "rouge2_recall": 0.16778603132296563, "rouge2_recall_stderr": 0.0019654958159185396, "rougeL_fmeasure": 0.2890014884493394, "rougeL_fmeasure_stderr": 0.0018207916834884527, "rougeL_precision": 0.33043423858845145, "rougeL_precision_stderr": 0.0024856924454187785, "rougeL_recall": 0.2872561015612187, "rougeL_recall_stderr": 0.0022337982582298155, "rougeLsum_fmeasure": 0.3277959949146152, "rougeLsum_fmeasure_stderr": 0.0021102604579282852, "rougeLsum_precision": 0.3738597805796585, "rougeLsum_precision_stderr": 0.002773351046379512, "rougeLsum_recall": 0.3249821747731812, "rougeLsum_recall_stderr": 0.002515405285337629}}, "2": {"generate_text_restaurant": {"bleu": 10.91262584267496, "bleu_stderr": 0.12763668087089558, "rouge1_fmeasure": 0.4158184072582917, "rouge1_fmeasure_stderr": 0.0021875821824657578, "rouge1_precision": 0.4745007823219277, "rouge1_precision_stderr": 0.003096779743530949, "rouge1_recall": 0.41114421373702703, "rouge1_recall_stderr": 0.002795411314326619, "rouge2_fmeasure": 0.18538437537983646, "rouge2_fmeasure_stderr": 0.001799521509176773, "rouge2_precision": 0.21433784297140893, "rouge2_precision_stderr": 0.002331836767735763, "rouge2_recall": 0.18401740949966608, "rouge2_recall_stderr": 0.002021878491927928, "rougeL_fmeasure": 0.30338837358825094, "rougeL_fmeasure_stderr": 0.001840114059497487, "rougeL_precision": 0.34796254903830837, "rougeL_precision_stderr": 0.002621056636644695, "rougeL_recall": 0.2999843023857228, "rougeL_recall_stderr": 0.002269391756360346, "rougeLsum_fmeasure": 0.3438444001935511, "rougeLsum_fmeasure_stderr": 0.0020872281136323037, "rougeLsum_precision": 0.39290245826296444, "rougeLsum_precision_stderr": 0.0028605438328826085, "rougeLsum_recall": 0.3399505800342592, "rougeLsum_recall_stderr": 0.0025542752245032124}}, "3": {"generate_text_restaurant": {"bleu": 11.413052941435677, "bleu_stderr": 0.20476608953978004, "rouge1_fmeasure": 0.42003617329938386, "rouge1_fmeasure_stderr": 0.002154444681203657, "rouge1_precision": 0.4836426637277247, "rouge1_precision_stderr": 0.003041696429497936, "rouge1_recall": 0.4077462553545315, "rouge1_recall_stderr": 0.002698117142879397, "rouge2_fmeasure": 0.19219003937378554, "rouge2_fmeasure_stderr": 0.0018883864751046853, "rouge2_precision": 0.22368030301486044, "rouge2_precision_stderr": 0.0024069791642568504, "rouge2_recall": 0.18735894030210265, "rouge2_recall_stderr": 0.0020871489281372663, "rougeL_fmeasure": 0.31324626443128384, "rougeL_fmeasure_stderr": 0.0019216880227233767, "rougeL_precision": 0.36189364939827273, "rougeL_precision_stderr": 0.0026889223009721307, "rougeL_recall": 0.30429571782108805, "rougeL_recall_stderr": 0.0023058551990661367, "rougeLsum_fmeasure": 0.35143133030157764, "rougeLsum_fmeasure_stderr": 0.002142111791450537, "rougeLsum_precision": 0.40514213772293617, "rougeLsum_precision_stderr": 0.0029172807069507275, "rougeLsum_recall": 0.341129701516605, "rougeLsum_recall_stderr": 0.0025465447925427515}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7452599273208484, "bleu_stderr": 0.0710626335421352, "rouge1_fmeasure": 0.2032531810901225, "rouge1_fmeasure_stderr": 0.002372876068699817, "rouge1_precision": 0.14818700118728248, "rouge1_precision_stderr": 0.0018676651463230222, "rouge1_recall": 0.34586554355363025, "rouge1_recall_stderr": 0.0041641658204401725, "rouge2_fmeasure": 0.04347043245633625, "rouge2_fmeasure_stderr": 0.001442207400667641, "rouge2_precision": 0.03119345685386172, "rouge2_precision_stderr": 0.0010373843022792317, "rouge2_recall": 0.07667587881403153, "rouge2_recall_stderr": 0.002653660348427432, "rougeL_fmeasure": 0.1528350870269447, "rougeL_fmeasure_stderr": 0.0017715223191776047, "rougeL_precision": 0.11119723252661128, "rougeL_precision_stderr": 0.0013663860116280308, "rougeL_recall": 0.2616100987791669, "rougeL_recall_stderr": 0.003275713171944408, "rougeLsum_fmeasure": 0.16043210769047098, "rougeLsum_fmeasure_stderr": 0.0020055581696195052, "rougeLsum_precision": 0.11658163341774634, "rougeLsum_precision_stderr": 0.0015235968275521983, "rougeLsum_recall": 0.275178892490744, "rougeLsum_recall_stderr": 0.0036968416298391823}}, "1": {"article_DOC_summary": {"bleu": 1.1604575025128918, "bleu_stderr": 0.0877746649601935, "rouge1_fmeasure": 0.16834116026562992, "rouge1_fmeasure_stderr": 0.0023007208182279906, "rouge1_precision": 0.11944768188603072, "rouge1_precision_stderr": 0.0017101747872860968, "rouge1_recall": 0.2967764173333572, "rouge1_recall_stderr": 0.003944563464315701, "rouge2_fmeasure": 0.030277850873655133, "rouge2_fmeasure_stderr": 0.0012385139115491765, "rouge2_precision": 0.021232879052259627, "rouge2_precision_stderr": 0.0008714791712295506, "rouge2_recall": 0.05526393564583485, "rouge2_recall_stderr": 0.0023240893343291387, "rougeL_fmeasure": 0.12978985120588027, "rougeL_fmeasure_stderr": 0.0017321643764184145, "rougeL_precision": 0.09192594935218694, "rougeL_precision_stderr": 0.0012769463345594439, "rougeL_recall": 0.23019911126154574, "rougeL_recall_stderr": 0.003091431421711302, "rougeLsum_fmeasure": 0.13564693066989447, "rougeLsum_fmeasure_stderr": 0.0018863533793062786, "rougeLsum_precision": 0.09602849667357606, "rougeLsum_precision_stderr": 0.001386138103889359, "rougeLsum_recall": 0.2407455765901895, "rougeLsum_recall_stderr": 0.003358547528460653}}, "2": {"article_DOC_summary": {"bleu": 1.1805459815971808, "bleu_stderr": 0.08801398952015627, "rouge1_fmeasure": 0.16960096474510714, "rouge1_fmeasure_stderr": 0.002303219538316631, "rouge1_precision": 0.12017928149855549, "rouge1_precision_stderr": 0.0017155806595076542, "rouge1_recall": 0.3000656582759754, "rouge1_recall_stderr": 0.003942220611475929, "rouge2_fmeasure": 0.03015426920788573, "rouge2_fmeasure_stderr": 0.001263243793733084, "rouge2_precision": 0.02115270486812927, "rouge2_precision_stderr": 0.0008856577807583306, "rouge2_recall": 0.054787883745542026, "rouge2_recall_stderr": 0.0023745228517959276, "rougeL_fmeasure": 0.131124075957363, "rougeL_fmeasure_stderr": 0.0017424293214564424, "rougeL_precision": 0.0927144051400674, "rougeL_precision_stderr": 0.0012857006594715614, "rougeL_recall": 0.23362729816209124, "rougeL_recall_stderr": 0.0031066367525268844, "rougeLsum_fmeasure": 0.1367233861657065, "rougeLsum_fmeasure_stderr": 0.0018905430841393567, "rougeLsum_precision": 0.09669042239875854, "rougeLsum_precision_stderr": 0.0013887653900610069, "rougeLsum_recall": 0.24335819875183146, "rougeLsum_recall_stderr": 0.003364341261182289}}, "3": {"article_DOC_summary": {"bleu": 1.2117893477080361, "bleu_stderr": 0.1149778380988921, "rouge1_fmeasure": 0.15889837582063066, "rouge1_fmeasure_stderr": 0.0024042734536394335, "rouge1_precision": 0.11536457953211869, "rouge1_precision_stderr": 0.0019139955193594944, "rouge1_recall": 0.2768098516237596, "rouge1_recall_stderr": 0.004205225171304428, "rouge2_fmeasure": 0.028265095806897757, "rouge2_fmeasure_stderr": 0.0012619861741006862, "rouge2_precision": 0.020015331184403896, "rouge2_precision_stderr": 0.0009021836723852284, "rouge2_recall": 0.05127981526296904, "rouge2_recall_stderr": 0.002364022279427071, "rougeL_fmeasure": 0.12453970375141683, "rougeL_fmeasure_stderr": 0.0018237860596141172, "rougeL_precision": 0.08997967190382496, "rougeL_precision_stderr": 0.0014235921883162194, "rougeL_recall": 0.21857370135939078, "rougeL_recall_stderr": 0.003310622426465869, "rougeLsum_fmeasure": 0.1276545465727294, "rougeLsum_fmeasure_stderr": 0.001972377201117121, "rougeLsum_precision": 0.09224753114626984, "rougeLsum_precision_stderr": 0.0015272098267094347, "rougeLsum_recall": 0.2238960772414025, "rougeLsum_recall_stderr": 0.003566353512085928}}}} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0.csv b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..6728080a36f3d24cd7b28cbba88e21ad906ddb67 --- /dev/null +++ b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.339,0.014976758771620344,0 +anli_r2,acc,0.335,0.014933117490932573,0 +anli_r3,acc,0.3416666666666667,0.013696658778002515,0 +arc_challenge,acc,0.2354948805460751,0.012399451855004757,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0 +arc_easy,acc,0.5593434343434344,0.010187264635711986,0 +arc_easy,acc_norm,0.49537037037037035,0.010259343705889734,0 +boolq,acc,0.544954128440367,0.008709637955263414,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.19555555555555557,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4342760406293567,0.0049464854665446254,0 +hellaswag,acc_norm,0.5590519816769568,0.0049548591067816675,0 +piqa,acc,0.736126224156692,0.010282996367695562,0 +piqa,acc_norm,0.7383025027203483,0.01025563077270823,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.801,0.01263164908309918,0 +sciq,acc_norm,0.72,0.0142056961040915,0 +storycloze_2016,acc,0.6905398182789952,0.010689956745189074,0 +winogrande,acc,0.5540647198105761,0.01397009348233069,0 diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json deleted file mode 100644 index df90f8bd997a2078434cae91f41d73a41b4b9487..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.339, - "acc_stderr": 0.014976758771620344 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002515 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.19555555555555557 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4342760406293567, - "acc_stderr": 0.0049464854665446254, - "acc_norm": 0.5590519816769568, - "acc_norm_stderr": 0.0049548591067816675 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5540647198105761, - "acc_stderr": 0.01397009348233069 - }, - "storycloze_2016": { - "acc": 0.6905398182789952, - "acc_stderr": 0.010689956745189074 - }, - "boolq": { - "acc": 0.544954128440367, - "acc_stderr": 0.008709637955263414 - }, - "arc_easy": { - "acc": 0.5593434343434344, - "acc_stderr": 0.010187264635711986, - "acc_norm": 0.49537037037037035, - "acc_norm_stderr": 0.010259343705889734 - }, - "arc_challenge": { - "acc": 0.2354948805460751, - "acc_stderr": 0.012399451855004757, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252425 - }, - "sciq": { - "acc": 0.801, - "acc_stderr": 0.01263164908309918, - "acc_norm": 0.72, - "acc_norm_stderr": 0.0142056961040915 - }, - "piqa": { - "acc": 0.736126224156692, - "acc_stderr": 0.010282996367695562, - "acc_norm": 0.7383025027203483, - "acc_norm_stderr": 0.01025563077270823 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1.csv b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..3901f08d02b52de52fbabf6742e0860b803b7782 --- /dev/null +++ b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.329,0.014865395385928364,0 +anli_r3,acc,0.3308333333333333,0.013588208070709002,0 +arc_challenge,acc,0.24573378839590443,0.012581033453730107,0 +arc_challenge,acc_norm,0.2713310580204778,0.012993807727545796,0 +arc_easy,acc,0.5660774410774411,0.010169795770462111,0 +arc_easy,acc_norm,0.5446127946127947,0.010218861787618725,0 +boolq,acc,0.5339449541284403,0.00872487854852522,1 +cb,acc,0.32142857142857145,0.06297362289056341,1 +cb,f1,0.2684950416948389,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.4312885879306911,0.0049424407463284975,0 +hellaswag,acc_norm,0.5588528181637125,0.0049550950962647085,0 +piqa,acc,0.7393906420021763,0.010241826155811623,0 +piqa,acc_norm,0.7410228509249184,0.01022096603140561,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.855,0.01113997751789014,0 +sciq,acc_norm,0.83,0.01188449583454167,0 +storycloze_2016,acc,0.6734366648850882,0.010844543793668893,0 +winogrande,acc,0.5414364640883977,0.014004146853791906,0 diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index 24714c91baffaf89eab7af652d629086b97deb2c..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928364 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070709002 - }, - "cb": { - "acc": 0.32142857142857145, - "acc_stderr": 0.06297362289056341, - "f1": 0.2684950416948389 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.4312885879306911, - "acc_stderr": 0.0049424407463284975, - "acc_norm": 0.5588528181637125, - "acc_norm_stderr": 0.0049550950962647085 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5414364640883977, - "acc_stderr": 0.014004146853791906 - }, - "storycloze_2016": { - "acc": 0.6734366648850882, - "acc_stderr": 0.010844543793668893 - }, - "boolq": { - "acc": 0.5339449541284403, - "acc_stderr": 0.00872487854852522 - }, - "arc_easy": { - "acc": 0.5660774410774411, - "acc_stderr": 0.010169795770462111, - "acc_norm": 0.5446127946127947, - "acc_norm_stderr": 0.010218861787618725 - }, - "arc_challenge": { - "acc": 0.24573378839590443, - "acc_stderr": 0.012581033453730107, - "acc_norm": 0.2713310580204778, - "acc_norm_stderr": 0.012993807727545796 - }, - "sciq": { - "acc": 0.855, - "acc_stderr": 0.01113997751789014, - "acc_norm": 0.83, - "acc_norm_stderr": 0.01188449583454167 - }, - "piqa": { - "acc": 0.7393906420021763, - "acc_stderr": 0.010241826155811623, - "acc_norm": 0.7410228509249184, - "acc_norm_stderr": 0.01022096603140561 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2.csv b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..c23b3361e3d61cc584d3df4c8175e436026acf35 --- /dev/null +++ b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928366,0 +anli_r2,acc,0.325,0.014818724459095526,0 +anli_r3,acc,0.3358333333333333,0.013639261190932882,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207708,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252427,0 +arc_easy,acc,0.5774410774410774,0.010135978222981078,0 +arc_easy,acc_norm,0.5576599326599326,0.010191334444220851,0 +boolq,acc,0.5397553516819572,0.008717368239786054,1 +cb,acc,0.17857142857142858,0.05164277182008721,1 +cb,f1,0.15455455455455455,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.4291973710416252,0.004939500404882189,0 +hellaswag,acc_norm,0.5610436168094005,0.004952454721934799,0 +piqa,acc,0.7383025027203483,0.010255630772708229,0 +piqa,acc_norm,0.735038084874864,0.010296557993316044,0 +rte,acc,0.5018050541516246,0.030096267148976626,0 +sciq,acc,0.875,0.010463483381956722,0 +sciq,acc_norm,0.85,0.011297239823409296,0 +storycloze_2016,acc,0.6819882415820417,0.010769343495248553,0 +winogrande,acc,0.5706393054459353,0.01391153749996916,0 diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json deleted file mode 100644 index a18344551700975a8f17186f3e144bbb4adf8215..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928366 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.013639261190932882 - }, - "cb": { - "acc": 0.17857142857142858, - "acc_stderr": 0.05164277182008721, - "f1": 0.15455455455455455 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.4291973710416252, - "acc_stderr": 0.004939500404882189, - "acc_norm": 0.5610436168094005, - "acc_norm_stderr": 0.004952454721934799 - }, - "rte": { - "acc": 0.5018050541516246, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.01391153749996916 - }, - "storycloze_2016": { - "acc": 0.6819882415820417, - "acc_stderr": 0.010769343495248553 - }, - "boolq": { - "acc": 0.5397553516819572, - "acc_stderr": 0.008717368239786054 - }, - "arc_easy": { - "acc": 0.5774410774410774, - "acc_stderr": 0.010135978222981078, - "acc_norm": 0.5576599326599326, - "acc_norm_stderr": 0.010191334444220851 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207708, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252427 - }, - "sciq": { - "acc": 0.875, - "acc_stderr": 0.010463483381956722, - "acc_norm": 0.85, - "acc_norm_stderr": 0.011297239823409296 - }, - "piqa": { - "acc": 0.7383025027203483, - "acc_stderr": 0.010255630772708229, - "acc_norm": 0.735038084874864, - "acc_norm_stderr": 0.010296557993316044 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3.csv b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..e7302426327497540f4ad39a7103835518f4709d --- /dev/null +++ b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.337,0.0149550879186536,0 +anli_r2,acc,0.335,0.014933117490932573,0 +anli_r3,acc,0.3233333333333333,0.013508372867300219,0 +arc_challenge,acc,0.24829351535836178,0.012624912868089753,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252428,0 +arc_easy,acc,0.5816498316498316,0.010122061470742861,0 +arc_easy,acc_norm,0.5627104377104377,0.010178768429321588,0 +boolq,acc,0.544954128440367,0.008709637955263421,1 +cb,acc,0.2857142857142857,0.06091449038731724,1 +cb,f1,0.24848484848484845,,1 +copa,acc,0.81,0.03942772444036623,0 +hellaswag,acc,0.43248356901015733,0.004944080605048776,0 +hellaswag,acc_norm,0.5600477992431786,0.004953667028654382,0 +piqa,acc,0.7415669205658324,0.01021397163677332,0 +piqa,acc_norm,0.735038084874864,0.010296557993316038,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.884,0.010131468138756995,0 +sciq,acc_norm,0.861,0.010945263761042965,0 +storycloze_2016,acc,0.6787814003206841,0.010798029402794916,0 +winogrande,acc,0.5406471981057617,0.014005973823825135,0 diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index bb12ce2718a9656092e4da5ad86c28bd16e97301..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.337, - "acc_stderr": 0.0149550879186536 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.013508372867300219 - }, - "cb": { - "acc": 0.2857142857142857, - "acc_stderr": 0.06091449038731724, - "f1": 0.24848484848484845 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.43248356901015733, - "acc_stderr": 0.004944080605048776, - "acc_norm": 0.5600477992431786, - "acc_norm_stderr": 0.004953667028654382 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5406471981057617, - "acc_stderr": 0.014005973823825135 - }, - "storycloze_2016": { - "acc": 0.6787814003206841, - "acc_stderr": 0.010798029402794916 - }, - "boolq": { - "acc": 0.544954128440367, - "acc_stderr": 0.008709637955263421 - }, - "arc_easy": { - "acc": 0.5816498316498316, - "acc_stderr": 0.010122061470742861, - "acc_norm": 0.5627104377104377, - "acc_norm_stderr": 0.010178768429321588 - }, - "arc_challenge": { - "acc": 0.24829351535836178, - "acc_stderr": 0.012624912868089753, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252428 - }, - "sciq": { - "acc": 0.884, - "acc_stderr": 0.010131468138756995, - "acc_norm": 0.861, - "acc_norm_stderr": 0.010945263761042965 - }, - "piqa": { - "acc": 0.7415669205658324, - "acc_stderr": 0.01021397163677332, - "acc_norm": 0.735038084874864, - "acc_norm_stderr": 0.010296557993316038 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4.csv b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..760f47dd29833eb45d36d86c2601056121d058c7 --- /dev/null +++ b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795025,0 +anli_r2,acc,0.33,0.014876872027456732,0 +anli_r3,acc,0.3375,0.013655897185463655,0 +arc_challenge,acc,0.24829351535836178,0.012624912868089762,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252428,0 +arc_easy,acc,0.5845959595959596,0.010111869494911519,0 +arc_easy,acc_norm,0.5715488215488216,0.010154195733990965,0 +boolq,acc,0.5516819571865443,0.008698213008694273,1 +cb,acc,0.2857142857142857,0.06091449038731725,1 +cb,f1,0.26612466124661244,,1 +copa,acc,0.78,0.041633319989322626,0 +hellaswag,acc,0.43168691495717987,0.004942990623131125,0 +hellaswag,acc_norm,0.5638319059948218,0.004948952519517512,0 +piqa,acc,0.735582154515778,0.01028978724476717,0 +piqa,acc_norm,0.735582154515778,0.01028978724476716,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.882,0.010206869264381791,0 +sciq,acc_norm,0.864,0.01084535023047299,0 +storycloze_2016,acc,0.6862640299305185,0.010730179119317623,0 +winogrande,acc,0.5501183898973955,0.013981711904049732,0 diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index 21cc2c71d0276b7d99c241c170fe628b45a71f44..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795025 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.014876872027456732 - }, - "anli_r3": { - "acc": 0.3375, - "acc_stderr": 0.013655897185463655 - }, - "cb": { - "acc": 0.2857142857142857, - "acc_stderr": 0.06091449038731725, - "f1": 0.26612466124661244 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.041633319989322626 - }, - "hellaswag": { - "acc": 0.43168691495717987, - "acc_stderr": 0.004942990623131125, - "acc_norm": 0.5638319059948218, - "acc_norm_stderr": 0.004948952519517512 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5501183898973955, - "acc_stderr": 0.013981711904049732 - }, - "storycloze_2016": { - "acc": 0.6862640299305185, - "acc_stderr": 0.010730179119317623 - }, - "boolq": { - "acc": 0.5516819571865443, - "acc_stderr": 0.008698213008694273 - }, - "arc_easy": { - "acc": 0.5845959595959596, - "acc_stderr": 0.010111869494911519, - "acc_norm": 0.5715488215488216, - "acc_norm_stderr": 0.010154195733990965 - }, - "arc_challenge": { - "acc": 0.24829351535836178, - "acc_stderr": 0.012624912868089762, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252428 - }, - "sciq": { - "acc": 0.882, - "acc_stderr": 0.010206869264381791, - "acc_norm": 0.864, - "acc_norm_stderr": 0.01084535023047299 - }, - "piqa": { - "acc": 0.735582154515778, - "acc_stderr": 0.01028978724476717, - "acc_norm": 0.735582154515778, - "acc_norm_stderr": 0.01028978724476716 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5.csv b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..8738e6e78c6e80fe3506670851c55ff039070660 --- /dev/null +++ b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.35,0.015090650341444233,0 +anli_r2,acc,0.334,0.014922019523732967,0 +anli_r3,acc,0.3175,0.013443538681348061,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378606,0 +arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0 +arc_easy,acc,0.5879629629629629,0.010099765857562762,0 +arc_easy,acc_norm,0.569023569023569,0.010161552863493746,0 +boolq,acc,0.5467889908256881,0.008706681265872488,1 +cb,acc,0.26785714285714285,0.05971290310957636,1 +cb,f1,0.24955436720142601,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.42999402509460266,0.004940631135803533,0 +hellaswag,acc_norm,0.566620195180243,0.004945291270072436,0 +piqa,acc,0.735038084874864,0.010296557993316052,0 +piqa,acc_norm,0.7448313384113167,0.01017157159252183,0 +rte,acc,0.5054151624548736,0.030094698123239966,0 +sciq,acc,0.889,0.009938701010583726,0 +sciq,acc_norm,0.879,0.010318210380946097,0 +storycloze_2016,acc,0.6803848209513629,0.01078375973373075,0 +winogrande,acc,0.5390686661404893,0.014009521680980306,0 diff --git a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index b7ab16119c81f7f78666c6a18b53d293128a453a..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed3/evaluation/rankeval/2b855b11bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.35, - "acc_stderr": 0.015090650341444233 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.014922019523732967 - }, - "anli_r3": { - "acc": 0.3175, - "acc_stderr": 0.013443538681348061 - }, - "cb": { - "acc": 0.26785714285714285, - "acc_stderr": 0.05971290310957636, - "f1": 0.24955436720142601 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.42999402509460266, - "acc_stderr": 0.004940631135803533, - "acc_norm": 0.566620195180243, - "acc_norm_stderr": 0.004945291270072436 - }, - "rte": { - "acc": 0.5054151624548736, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5390686661404893, - "acc_stderr": 0.014009521680980306 - }, - "storycloze_2016": { - "acc": 0.6803848209513629, - "acc_stderr": 0.01078375973373075 - }, - "boolq": { - "acc": 0.5467889908256881, - "acc_stderr": 0.008706681265872488 - }, - "arc_easy": { - "acc": 0.5879629629629629, - "acc_stderr": 0.010099765857562762, - "acc_norm": 0.569023569023569, - "acc_norm_stderr": 0.010161552863493746 - }, - "arc_challenge": { - "acc": 0.25341296928327645, - "acc_stderr": 0.012710896778378606, - "acc_norm": 0.2832764505119454, - "acc_norm_stderr": 0.013167478735134575 - }, - "sciq": { - "acc": 0.889, - "acc_stderr": 0.009938701010583726, - "acc_norm": 0.879, - "acc_norm_stderr": 0.010318210380946097 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316052, - "acc_norm": 0.7448313384113167, - "acc_norm_stderr": 0.01017157159252183 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0.csv b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..c96e2bbc8c77eed3b833012165c7dd56e4e2b003 --- /dev/null +++ b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.014830507204541033,0 +anli_r2,acc,0.332,0.014899597242811476,0 +anli_r3,acc,0.33,0.013579531277800925,0 +arc_challenge,acc,0.2431740614334471,0.01253655414458709,0 +arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0 +arc_easy,acc,0.5698653198653199,0.010159130445178499,0 +arc_easy,acc_norm,0.49957912457912457,0.010259779886094424,0 +boolq,acc,0.6055045871559633,0.008548152025770936,1 +cb,acc,0.35714285714285715,0.06460957383809221,1 +cb,f1,0.2884615384615385,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.43686516630153355,0.004949842967331425,0 +hellaswag,acc_norm,0.5622385978888668,0.004950973231188733,0 +piqa,acc,0.7399347116430903,0.010234893249061308,0 +piqa,acc_norm,0.7431991294885746,0.010192864802278033,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.819,0.012181436179177923,0 +sciq,acc_norm,0.747,0.01375427861358708,0 +storycloze_2016,acc,0.6980224478888295,0.010616985436073357,0 +winogrande,acc,0.5611681136543015,0.013946933444507032,0 diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index 85374229bac0b9c437c8c97349be4d2091becd47..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.014830507204541033 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811476 - }, - "anli_r3": { - "acc": 0.33, - "acc_stderr": 0.013579531277800925 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.2884615384615385 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.43686516630153355, - "acc_stderr": 0.004949842967331425, - "acc_norm": 0.5622385978888668, - "acc_norm_stderr": 0.004950973231188733 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5611681136543015, - "acc_stderr": 0.013946933444507032 - }, - "storycloze_2016": { - "acc": 0.6980224478888295, - "acc_stderr": 0.010616985436073357 - }, - "boolq": { - "acc": 0.6055045871559633, - "acc_stderr": 0.008548152025770936 - }, - "arc_easy": { - "acc": 0.5698653198653199, - "acc_stderr": 0.010159130445178499, - "acc_norm": 0.49957912457912457, - "acc_norm_stderr": 0.010259779886094424 - }, - "arc_challenge": { - "acc": 0.2431740614334471, - "acc_stderr": 0.01253655414458709, - "acc_norm": 0.2773037542662116, - "acc_norm_stderr": 0.013082095839059374 - }, - "sciq": { - "acc": 0.819, - "acc_stderr": 0.012181436179177923, - "acc_norm": 0.747, - "acc_norm_stderr": 0.01375427861358708 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.010234893249061308, - "acc_norm": 0.7431991294885746, - "acc_norm_stderr": 0.010192864802278033 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1.csv b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..d25ab42136d6ed1ae90a5b6488c830cef87adbb9 --- /dev/null +++ b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.306,0.01458000605543697,0 +anli_r2,acc,0.322,0.014782913600996669,0 +anli_r3,acc,0.3416666666666667,0.013696658778002519,0 +arc_challenge,acc,0.24744027303754265,0.012610352663292673,0 +arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0 +arc_easy,acc,0.5744949494949495,0.01014527118259102,0 +arc_easy,acc_norm,0.5391414141414141,0.010228298200766126,0 +boolq,acc,0.5761467889908257,0.008643046537505764,1 +cb,acc,0.35714285714285715,0.06460957383809221,1 +cb,f1,0.3268421052631579,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.43596893049193386,0.004948696280312425,0 +hellaswag,acc_norm,0.5642302330213105,0.004948439229523909,0 +piqa,acc,0.7366702937976061,0.010276185322196766,0 +piqa,acc_norm,0.7372143634385201,0.010269354068140783,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.869,0.010674874844837952,0 +sciq,acc_norm,0.853,0.011203415395160333,0 +storycloze_2016,acc,0.6926777124532336,0.01066944508186666,0 +winogrande,acc,0.55327545382794,0.013972488371616692,0 diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index f4052c6311c270f329dee652b67eca42805777fb..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.306, - "acc_stderr": 0.01458000605543697 - }, - "anli_r2": { - "acc": 0.322, - "acc_stderr": 0.014782913600996669 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002519 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.3268421052631579 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.43596893049193386, - "acc_stderr": 0.004948696280312425, - "acc_norm": 0.5642302330213105, - "acc_norm_stderr": 0.004948439229523909 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.55327545382794, - "acc_stderr": 0.013972488371616692 - }, - "storycloze_2016": { - "acc": 0.6926777124532336, - "acc_stderr": 0.01066944508186666 - }, - "boolq": { - "acc": 0.5761467889908257, - "acc_stderr": 0.008643046537505764 - }, - "arc_easy": { - "acc": 0.5744949494949495, - "acc_stderr": 0.01014527118259102, - "acc_norm": 0.5391414141414141, - "acc_norm_stderr": 0.010228298200766126 - }, - "arc_challenge": { - "acc": 0.24744027303754265, - "acc_stderr": 0.012610352663292673, - "acc_norm": 0.27559726962457337, - "acc_norm_stderr": 0.013057169655761838 - }, - "sciq": { - "acc": 0.869, - "acc_stderr": 0.010674874844837952, - "acc_norm": 0.853, - "acc_norm_stderr": 0.011203415395160333 - }, - "piqa": { - "acc": 0.7366702937976061, - "acc_stderr": 0.010276185322196766, - "acc_norm": 0.7372143634385201, - "acc_norm_stderr": 0.010269354068140783 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2.csv b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..3ce65fccc5d20ec35f93dafd924fea08e5c04c43 --- /dev/null +++ b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.323,0.014794927843348639,0 +anli_r2,acc,0.311,0.014645596385722699,0 +anli_r3,acc,0.32666666666666666,0.013544340907003663,0 +arc_challenge,acc,0.2593856655290102,0.012808273573927104,0 +arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0 +arc_easy,acc,0.5833333333333334,0.010116282977781239,0 +arc_easy,acc_norm,0.5711279461279462,0.010155440652900152,0 +boolq,acc,0.573394495412844,0.008650327037726273,1 +cb,acc,0.2857142857142857,0.060914490387317236,1 +cb,f1,0.21909633418584828,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.43537143995220073,0.004947922692688842,0 +hellaswag,acc_norm,0.5636327424815774,0.004949207947265913,0 +piqa,acc,0.736126224156692,0.010282996367695564,0 +piqa,acc_norm,0.7442872687704026,0.010178690109459872,0 +rte,acc,0.48375451263537905,0.030080573208738064,0 +sciq,acc,0.877,0.010391293421849874,0 +sciq,acc_norm,0.858,0.011043457699378215,0 +storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0 +winogrande,acc,0.5453827940015785,0.013994481027066,0 diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index a35d19054b2746974db6775131d14e79635df48f..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.014794927843348639 - }, - "anli_r2": { - "acc": 0.311, - "acc_stderr": 0.014645596385722699 - }, - "anli_r3": { - "acc": 0.32666666666666666, - "acc_stderr": 0.013544340907003663 - }, - "cb": { - "acc": 0.2857142857142857, - "acc_stderr": 0.060914490387317236, - "f1": 0.21909633418584828 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.43537143995220073, - "acc_stderr": 0.004947922692688842, - "acc_norm": 0.5636327424815774, - "acc_norm_stderr": 0.004949207947265913 - }, - "rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5453827940015785, - "acc_stderr": 0.013994481027066 - }, - "storycloze_2016": { - "acc": 0.6937466595403528, - "acc_stderr": 0.010659088460112754 - }, - "boolq": { - "acc": 0.573394495412844, - "acc_stderr": 0.008650327037726273 - }, - "arc_easy": { - "acc": 0.5833333333333334, - "acc_stderr": 0.010116282977781239, - "acc_norm": 0.5711279461279462, - "acc_norm_stderr": 0.010155440652900152 - }, - "arc_challenge": { - "acc": 0.2593856655290102, - "acc_stderr": 0.012808273573927104, - "acc_norm": 0.2773037542662116, - "acc_norm_stderr": 0.013082095839059374 - }, - "sciq": { - "acc": 0.877, - "acc_stderr": 0.010391293421849874, - "acc_norm": 0.858, - "acc_norm_stderr": 0.011043457699378215 - }, - "piqa": { - "acc": 0.736126224156692, - "acc_stderr": 0.010282996367695564, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.010178690109459872 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3.csv b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..8922309430a5432b26c15d5455831860166bb239 --- /dev/null +++ b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.298,0.014470846741134694,0 +anli_r2,acc,0.353,0.015120172605483687,0 +anli_r3,acc,0.3425,0.013704669762934732,0 +arc_challenge,acc,0.25170648464163825,0.01268249633404296,0 +arc_challenge,acc_norm,0.2790102389078498,0.013106784883601336,0 +arc_easy,acc,0.5757575757575758,0.010141333654958562,0 +arc_easy,acc_norm,0.5648148148148148,0.010173216430370917,0 +boolq,acc,0.5865443425076453,0.008613059239942643,1 +cb,acc,0.35714285714285715,0.06460957383809221,1 +cb,f1,0.30450234601177995,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.43527185819557856,0.004947793051042668,0 +hellaswag,acc_norm,0.5651264688309102,0.00494727245422622,0 +piqa,acc,0.7399347116430903,0.010234893249061306,0 +piqa,acc_norm,0.750816104461371,0.010091882770120214,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.883,0.010169287802713329,0 +sciq,acc_norm,0.859,0.011010914595992441,0 +storycloze_2016,acc,0.6974879743452699,0.010622307774396943,0 +winogrande,acc,0.5595895816890292,0.0139523303119156,0 diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index 764cf88d098541f438521fb8f5adfde06290148d..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.298, - "acc_stderr": 0.014470846741134694 - }, - "anli_r2": { - "acc": 0.353, - "acc_stderr": 0.015120172605483687 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934732 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.30450234601177995 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.43527185819557856, - "acc_stderr": 0.004947793051042668, - "acc_norm": 0.5651264688309102, - "acc_norm_stderr": 0.00494727245422622 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5595895816890292, - "acc_stderr": 0.0139523303119156 - }, - "storycloze_2016": { - "acc": 0.6974879743452699, - "acc_stderr": 0.010622307774396943 - }, - "boolq": { - "acc": 0.5865443425076453, - "acc_stderr": 0.008613059239942643 - }, - "arc_easy": { - "acc": 0.5757575757575758, - "acc_stderr": 0.010141333654958562, - "acc_norm": 0.5648148148148148, - "acc_norm_stderr": 0.010173216430370917 - }, - "arc_challenge": { - "acc": 0.25170648464163825, - "acc_stderr": 0.01268249633404296, - "acc_norm": 0.2790102389078498, - "acc_norm_stderr": 0.013106784883601336 - }, - "sciq": { - "acc": 0.883, - "acc_stderr": 0.010169287802713329, - "acc_norm": 0.859, - "acc_norm_stderr": 0.011010914595992441 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.010234893249061306, - "acc_norm": 0.750816104461371, - "acc_norm_stderr": 0.010091882770120214 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4.csv b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..5bc231c7ce7ab11d56ca68457ea345ab63a07dae --- /dev/null +++ b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.01486539538592837,0 +anli_r2,acc,0.318,0.0147340793093119,0 +anli_r3,acc,0.3325,0.013605417345710528,0 +arc_challenge,acc,0.2551194539249147,0.012739038695202104,0 +arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0 +arc_easy,acc,0.5744949494949495,0.010145271182591018,0 +arc_easy,acc_norm,0.5669191919191919,0.010167478013701792,0 +boolq,acc,0.582262996941896,0.008625883905552707,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.3471907281431091,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.43487353116908983,0.004947272454226208,0 +hellaswag,acc_norm,0.5681139215295757,0.004943264339868658,0 +piqa,acc,0.7426550598476604,0.01019992106479251,0 +piqa,acc_norm,0.7470076169749728,0.010142888698862455,0 +rte,acc,0.51985559566787,0.030072723167317184,0 +sciq,acc,0.884,0.010131468138757,0 +sciq,acc_norm,0.875,0.010463483381956722,0 +storycloze_2016,acc,0.6937466595403528,0.010659088460112754,0 +winogrande,acc,0.5714285714285714,0.013908353814606691,0 diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json deleted file mode 100644 index 9d0379266b0c1b61781d2488c5429b33bfdb358a..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.01486539538592837 - }, - "anli_r2": { - "acc": 0.318, - "acc_stderr": 0.0147340793093119 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.013605417345710528 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.3471907281431091 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.43487353116908983, - "acc_stderr": 0.004947272454226208, - "acc_norm": 0.5681139215295757, - "acc_norm_stderr": 0.004943264339868658 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317184 - }, - "winogrande": { - "acc": 0.5714285714285714, - "acc_stderr": 0.013908353814606691 - }, - "storycloze_2016": { - "acc": 0.6937466595403528, - "acc_stderr": 0.010659088460112754 - }, - "boolq": { - "acc": 0.582262996941896, - "acc_stderr": 0.008625883905552707 - }, - "arc_easy": { - "acc": 0.5744949494949495, - "acc_stderr": 0.010145271182591018, - "acc_norm": 0.5669191919191919, - "acc_norm_stderr": 0.010167478013701792 - }, - "arc_challenge": { - "acc": 0.2551194539249147, - "acc_stderr": 0.012739038695202104, - "acc_norm": 0.2841296928327645, - "acc_norm_stderr": 0.013179442447653886 - }, - "sciq": { - "acc": 0.884, - "acc_stderr": 0.010131468138757, - "acc_norm": 0.875, - "acc_norm_stderr": 0.010463483381956722 - }, - "piqa": { - "acc": 0.7426550598476604, - "acc_stderr": 0.01019992106479251, - "acc_norm": 0.7470076169749728, - "acc_norm_stderr": 0.010142888698862455 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5.csv b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..f85accc9982e938a7d8b4909939a11c728d302ab --- /dev/null +++ b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.346,0.015050266127564443,0 +anli_r3,acc,0.3383333333333333,0.013664144006618261,0 +arc_challenge,acc,0.2687713310580205,0.012955065963710695,0 +arc_challenge,acc_norm,0.27986348122866894,0.013119040897725923,0 +arc_easy,acc,0.5782828282828283,0.01013325528401233,0 +arc_easy,acc_norm,0.5711279461279462,0.010155440652900152,0 +boolq,acc,0.5886850152905199,0.008606395426309215,1 +cb,acc,0.35714285714285715,0.06460957383809221,1 +cb,f1,0.2634620436038876,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.43547102170882296,0.004948052131344501,0 +hellaswag,acc_norm,0.5694084843656642,0.004941470620074855,0 +piqa,acc,0.7388465723612623,0.010248738649935576,0 +piqa,acc_norm,0.7442872687704026,0.010178690109459867,0 +rte,acc,0.5451263537906137,0.029973636495415255,0 +sciq,acc,0.89,0.00989939381972445,0 +sciq,acc_norm,0.885,0.010093407594904631,0 +storycloze_2016,acc,0.6985569214323891,0.010611646032767584,0 +winogrande,acc,0.5524861878453039,0.013974847640536199,0 diff --git a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index 28b8d91ab6780315f96d73bfff197dd4904602ba..0000000000000000000000000000000000000000 --- a/2b855b11bc4seed4/evaluation/rankeval/2b855b11bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.015050266127564443 - }, - "anli_r3": { - "acc": 0.3383333333333333, - "acc_stderr": 0.013664144006618261 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.2634620436038876 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.43547102170882296, - "acc_stderr": 0.004948052131344501, - "acc_norm": 0.5694084843656642, - "acc_norm_stderr": 0.004941470620074855 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415255 - }, - "winogrande": { - "acc": 0.5524861878453039, - "acc_stderr": 0.013974847640536199 - }, - "storycloze_2016": { - "acc": 0.6985569214323891, - "acc_stderr": 0.010611646032767584 - }, - "boolq": { - "acc": 0.5886850152905199, - "acc_stderr": 0.008606395426309215 - }, - "arc_easy": { - "acc": 0.5782828282828283, - "acc_stderr": 0.01013325528401233, - "acc_norm": 0.5711279461279462, - "acc_norm_stderr": 0.010155440652900152 - }, - "arc_challenge": { - "acc": 0.2687713310580205, - "acc_stderr": 0.012955065963710695, - "acc_norm": 0.27986348122866894, - "acc_norm_stderr": 0.013119040897725923 - }, - "sciq": { - "acc": 0.89, - "acc_stderr": 0.00989939381972445, - "acc_norm": 0.885, - "acc_norm_stderr": 0.010093407594904631 - }, - "piqa": { - "acc": 0.7388465723612623, - "acc_stderr": 0.010248738649935576, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.010178690109459867 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/generation/merged.csv b/2b855b14bc4seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d5b49a4a3ffc756a48d67c045bdc657af0502b86 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00012949433318118156 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00012949433318118156 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.18224154640367043 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.18224154640367043 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20392339700886974 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20392339700886974 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.20537187889646735 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.20537187889646735 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.200121282410508 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.200121282410508 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19429987532612708 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19429987532612708 +e2e_nlg_cleaned,5,average,multiple,0.16434791239647062 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.044020752612575986 +gem_xsum,0,median,rouge2_fmeasure,0.044020752612575986 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.02968537882036386 +gem_xsum,1,median,rouge2_fmeasure,0.02968537882036386 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.030040223235727605 +gem_xsum,2,median,rouge2_fmeasure,0.030040223235727605 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.030050510110479087 +gem_xsum,3,median,rouge2_fmeasure,0.030050510110479087 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.007253870650249119 +gem_xsum,4,median,rouge2_fmeasure,0.007253870650249119 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,8.848593754254134e-05 +gem_xsum,5,median,rouge2_fmeasure,8.848593754254134e-05 +gem_xsum,5,average,multiple,0.023523203561156367 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.052796392998268285 +web_nlg_en,0,median,rouge2_fmeasure,0.052796392998268285 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05226153564491622 +web_nlg_en,1,median,rouge2_fmeasure,0.05226153564491622 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05325778753738895 +web_nlg_en,2,median,rouge2_fmeasure,0.05325778753738895 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05116400683224674 +web_nlg_en,3,median,rouge2_fmeasure,0.05116400683224674 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0543334322796211 +web_nlg_en,4,median,rouge2_fmeasure,0.0543334322796211 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054406851908666655 +web_nlg_en,5,median,rouge2_fmeasure,0.054406851908666655 +web_nlg_en,5,average,multiple,0.05303666786685132 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03281166114649338 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03281166114649338 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03952348883731413 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03952348883731413 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04385052776628551 +wiki_lingua_en,2,median,rouge2_fmeasure,0.04385052776628551 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03576925556291925 +wiki_lingua_en,3,median,rouge2_fmeasure,0.03576925556291925 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.0115081158992026 +wiki_lingua_en,4,median,rouge2_fmeasure,0.0115081158992026 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0019115778864576577 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0019115778864576577 +wiki_lingua_en,5,average,multiple,0.027562437849778753 diff --git a/2b855b14bc4seed1/evaluation/generation/merged.json b/2b855b14bc4seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..0d97dddfabf645fda83f3e31e63eeff201d5451c --- /dev/null +++ b/2b855b14bc4seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3859706493814145, "bleu_stderr": 0.02887493788243282, "rouge1_fmeasure": 0.11155152422680106, "rouge1_fmeasure_stderr": 0.002282891780760996, "rouge1_precision": 0.07428960078453671, "rouge1_precision_stderr": 0.0018063226287394017, "rouge1_recall": 0.30350663746210643, "rouge1_recall_stderr": 0.00473697872155852, "rouge2_fmeasure": 0.052796392998268285, "rouge2_fmeasure_stderr": 0.0013951384332651713, "rouge2_precision": 0.03531981209056992, "rouge2_precision_stderr": 0.0011595867013095926, "rouge2_recall": 0.14540106420377438, "rouge2_recall_stderr": 0.003142842212970267, "rougeL_fmeasure": 0.10653891951191909, "rougeL_fmeasure_stderr": 0.0020787485289465255, "rougeL_precision": 0.07055647736310881, "rougeL_precision_stderr": 0.0016283929067047295, "rougeL_recall": 0.2935720196205313, "rougeL_recall_stderr": 0.0045875931258363735, "rougeLsum_fmeasure": 0.10565401215529986, "rougeLsum_fmeasure_stderr": 0.0021263606523734837, "rougeLsum_precision": 0.07024334129648914, "rougeLsum_precision_stderr": 0.0016843891256844708, "rougeLsum_recall": 0.288622206208257, "rougeLsum_recall_stderr": 0.004462405964590755}}, "1": {"PALM_prompt": {"bleu": 0.4498997351689477, "bleu_stderr": 0.03674862617096384, "rouge1_fmeasure": 0.11423642365402245, "rouge1_fmeasure_stderr": 0.002125553076824774, "rouge1_precision": 0.07474282486730215, "rouge1_precision_stderr": 0.0017060899291641667, "rouge1_recall": 0.35037818287785766, "rouge1_recall_stderr": 0.005089644035448327, "rouge2_fmeasure": 0.05226153564491622, "rouge2_fmeasure_stderr": 0.0012866470017600612, "rouge2_precision": 0.033947865346798244, "rouge2_precision_stderr": 0.0009605222606600222, "rouge2_recall": 0.16651493031375508, "rouge2_recall_stderr": 0.0034551314473355676, "rougeL_fmeasure": 0.1075089911234956, "rougeL_fmeasure_stderr": 0.0019013555079034736, "rougeL_precision": 0.07006889806462963, "rougeL_precision_stderr": 0.0015199133453655971, "rougeL_recall": 0.3313741675277014, "rougeL_recall_stderr": 0.004724938074588536, "rougeLsum_fmeasure": 0.1080259820412502, "rougeLsum_fmeasure_stderr": 0.001985969459394615, "rougeLsum_precision": 0.07068151425631596, "rougeLsum_precision_stderr": 0.0016036772272344858, "rougeLsum_recall": 0.330968980956117, "rougeLsum_recall_stderr": 0.004694246590162869}}, "2": {"PALM_prompt": {"bleu": 0.4753049001409591, "bleu_stderr": 0.022662210056026047, "rouge1_fmeasure": 0.11668735202155088, "rouge1_fmeasure_stderr": 0.001969072957510736, "rouge1_precision": 0.07509396872848731, "rouge1_precision_stderr": 0.001495770634509107, "rouge1_recall": 0.369650217038538, "rouge1_recall_stderr": 0.004969240424808801, "rouge2_fmeasure": 0.05325778753738895, "rouge2_fmeasure_stderr": 0.0012240840116235934, "rouge2_precision": 0.034222534671223666, "rouge2_precision_stderr": 0.0009009097779622381, "rouge2_recall": 0.17744398005391065, "rouge2_recall_stderr": 0.003538000874923632, "rougeL_fmeasure": 0.10948225690231367, "rougeL_fmeasure_stderr": 0.0017896005091475441, "rougeL_precision": 0.07035789575974258, "rougeL_precision_stderr": 0.0013479485808054492, "rougeL_recall": 0.346088142522385, "rougeL_recall_stderr": 0.004573219497516567, "rougeLsum_fmeasure": 0.11040902543720352, "rougeLsum_fmeasure_stderr": 0.001845965020973935, "rougeLsum_precision": 0.07105991740872057, "rougeLsum_precision_stderr": 0.0014021861498776512, "rougeLsum_recall": 0.34990113897565156, "rougeLsum_recall_stderr": 0.004653894399099468}}, "3": {"PALM_prompt": {"bleu": 0.49330582546586715, "bleu_stderr": 0.026760635766622045, "rouge1_fmeasure": 0.11255875083875559, "rouge1_fmeasure_stderr": 0.0018899560091881686, "rouge1_precision": 0.07197595815311898, "rouge1_precision_stderr": 0.0014085873060936218, "rouge1_recall": 0.36596279105771895, "rouge1_recall_stderr": 0.005060110953265633, "rouge2_fmeasure": 0.05116400683224674, "rouge2_fmeasure_stderr": 0.0011637012566298453, "rouge2_precision": 0.032559872031436, "rouge2_precision_stderr": 0.0008258506188369471, "rouge2_recall": 0.17582832452926037, "rouge2_recall_stderr": 0.0036102167860794358, "rougeL_fmeasure": 0.1053125965181327, "rougeL_fmeasure_stderr": 0.0017380106035689842, "rougeL_precision": 0.06730885297435893, "rougeL_precision_stderr": 0.0012874209586825013, "rougeL_recall": 0.3410263606925172, "rougeL_recall_stderr": 0.004632395084547858, "rougeLsum_fmeasure": 0.10651933217629937, "rougeLsum_fmeasure_stderr": 0.0017835590190595765, "rougeLsum_precision": 0.06816617828713667, "rougeLsum_precision_stderr": 0.0013317358072988438, "rougeLsum_recall": 0.3452024329512138, "rougeLsum_recall_stderr": 0.0046680401031925535}}, "4": {"PALM_prompt": {"bleu": 0.5805875464248518, "bleu_stderr": 0.02467601476988524, "rouge1_fmeasure": 0.1185079312492515, "rouge1_fmeasure_stderr": 0.0019283588373999788, "rouge1_precision": 0.0759876905383805, "rouge1_precision_stderr": 0.0014999294305126894, "rouge1_recall": 0.38441082579620744, "rouge1_recall_stderr": 0.005176874183483729, "rouge2_fmeasure": 0.0543334322796211, "rouge2_fmeasure_stderr": 0.001167839949370806, "rouge2_precision": 0.034693214028384364, "rouge2_precision_stderr": 0.0008716169777471336, "rouge2_recall": 0.18834133037111078, "rouge2_recall_stderr": 0.0037005417550254813, "rougeL_fmeasure": 0.10928166815352991, "rougeL_fmeasure_stderr": 0.0017079278026699558, "rougeL_precision": 0.06996833075999682, "rougeL_precision_stderr": 0.0013122027020668525, "rougeL_recall": 0.35440915717841187, "rougeL_recall_stderr": 0.0046597414750728676, "rougeLsum_fmeasure": 0.11155898719590468, "rougeLsum_fmeasure_stderr": 0.0018082007870370246, "rougeLsum_precision": 0.07161964114325686, "rougeLsum_precision_stderr": 0.00141486011529458, "rougeLsum_recall": 0.36094525512943, "rougeLsum_recall_stderr": 0.004743097077998619}}, "5": {"PALM_prompt": {"bleu": 0.6262213710856557, "bleu_stderr": 0.03220830109957928, "rouge1_fmeasure": 0.11957471813564298, "rouge1_fmeasure_stderr": 0.0018632136844397324, "rouge1_precision": 0.07568606204419236, "rouge1_precision_stderr": 0.0013727260179330872, "rouge1_recall": 0.4004498860844127, "rouge1_recall_stderr": 0.005164039996437608, "rouge2_fmeasure": 0.054406851908666655, "rouge2_fmeasure_stderr": 0.0011337345810570566, "rouge2_precision": 0.03424858760150629, "rouge2_precision_stderr": 0.0007988668794381154, "rouge2_recall": 0.19500849332230508, "rouge2_recall_stderr": 0.003726245247160497, "rougeL_fmeasure": 0.10937979253378736, "rougeL_fmeasure_stderr": 0.0016473303165533246, "rougeL_precision": 0.06919783619593119, "rougeL_precision_stderr": 0.0012097192783486648, "rougeL_recall": 0.3665037791378817, "rougeL_recall_stderr": 0.004626331345429801, "rougeLsum_fmeasure": 0.11268811047269227, "rougeLsum_fmeasure_stderr": 0.0017367441620354863, "rougeLsum_precision": 0.07131226400764427, "rougeLsum_precision_stderr": 0.0012731468955373129, "rougeLsum_recall": 0.37701365596712055, "rougeLsum_recall_stderr": 0.004768186337938744}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4085781866227987, "bleu_stderr": 0.06765316931490462, "rouge1_fmeasure": 0.1735271187409542, "rouge1_fmeasure_stderr": 0.0017420641997891633, "rouge1_precision": 0.14862761907514144, "rouge1_precision_stderr": 0.0018170184325446725, "rouge1_recall": 0.25162969277056396, "rouge1_recall_stderr": 0.0025110750991919678, "rouge2_fmeasure": 0.03281166114649338, "rouge2_fmeasure_stderr": 0.0007678027921869415, "rouge2_precision": 0.027931710583823308, "rouge2_precision_stderr": 0.0006852910799466808, "rouge2_recall": 0.04947292517867473, "rouge2_recall_stderr": 0.001323152681280402, "rougeL_fmeasure": 0.13211424580114145, "rougeL_fmeasure_stderr": 0.0012189992528884612, "rougeL_precision": 0.11167291160760119, "rougeL_precision_stderr": 0.0012338979014086847, "rougeL_recall": 0.1967670614485443, "rougeL_recall_stderr": 0.0020238976303286155, "rougeLsum_fmeasure": 0.16049275586848605, "rougeLsum_fmeasure_stderr": 0.0015915307859795458, "rougeLsum_precision": 0.13726434831918505, "rougeLsum_precision_stderr": 0.001658580287033731, "rougeLsum_recall": 0.23359610303376463, "rougeLsum_recall_stderr": 0.0023331163283850007}}, "1": {"tldr_en": {"bleu": 1.8695588017385163, "bleu_stderr": 0.05321662312142784, "rouge1_fmeasure": 0.18920766306464926, "rouge1_fmeasure_stderr": 0.0019111483043418977, "rouge1_precision": 0.16277193183652522, "rouge1_precision_stderr": 0.0019791861140128785, "rouge1_recall": 0.27320755514294287, "rouge1_recall_stderr": 0.0027529027179660705, "rouge2_fmeasure": 0.03952348883731413, "rouge2_fmeasure_stderr": 0.0009017689028399563, "rouge2_precision": 0.03377477444135655, "rouge2_precision_stderr": 0.0008162143350541909, "rouge2_recall": 0.05888454857622772, "rouge2_recall_stderr": 0.0014868106734162374, "rougeL_fmeasure": 0.14124032964831484, "rougeL_fmeasure_stderr": 0.0012998701147887161, "rougeL_precision": 0.12024619489337642, "rougeL_precision_stderr": 0.0013281691582764977, "rougeL_recall": 0.2091152162389091, "rougeL_recall_stderr": 0.00215651070348592, "rougeLsum_fmeasure": 0.17618136263300152, "rougeLsum_fmeasure_stderr": 0.0017768481626556554, "rougeLsum_precision": 0.15124075642915727, "rougeLsum_precision_stderr": 0.001833497711400087, "rougeLsum_recall": 0.2556042591652215, "rougeLsum_recall_stderr": 0.002599027550285205}}, "2": {"tldr_en": {"bleu": 2.0895250564263526, "bleu_stderr": 0.05260673114945821, "rouge1_fmeasure": 0.19547280360886987, "rouge1_fmeasure_stderr": 0.001894454328895725, "rouge1_precision": 0.1705052900654322, "rouge1_precision_stderr": 0.0020623873190335836, "rouge1_recall": 0.27934387948105693, "rouge1_recall_stderr": 0.00269250839204282, "rouge2_fmeasure": 0.04385052776628551, "rouge2_fmeasure_stderr": 0.0009414966007970309, "rouge2_precision": 0.03848650920945066, "rouge2_precision_stderr": 0.000929187611532398, "rouge2_recall": 0.06443059864859675, "rouge2_recall_stderr": 0.0015228056774988696, "rougeL_fmeasure": 0.147954778208212, "rougeL_fmeasure_stderr": 0.0013306013768007523, "rougeL_precision": 0.12791269972430605, "rougeL_precision_stderr": 0.0014524157223196957, "rougeL_recall": 0.2160525397509754, "rougeL_recall_stderr": 0.0021509404278420674, "rougeLsum_fmeasure": 0.18149103619605517, "rougeLsum_fmeasure_stderr": 0.0017456373642829746, "rougeLsum_precision": 0.15803063427007255, "rougeLsum_precision_stderr": 0.0019019671150593402, "rougeLsum_recall": 0.26032488790855124, "rougeLsum_recall_stderr": 0.0025259372632087294}}, "3": {"tldr_en": {"bleu": 1.9845021837810688, "bleu_stderr": 0.06643615389489499, "rouge1_fmeasure": 0.1627969919759764, "rouge1_fmeasure_stderr": 0.002102839789628136, "rouge1_precision": 0.14794605221202178, "rouge1_precision_stderr": 0.0023288106629145966, "rouge1_recall": 0.23196887225200322, "rouge1_recall_stderr": 0.0030906899098327623, "rouge2_fmeasure": 0.03576925556291925, "rouge2_fmeasure_stderr": 0.0008895897244987175, "rouge2_precision": 0.03234477098093676, "rouge2_precision_stderr": 0.0009482837463457055, "rouge2_recall": 0.052607551971036656, "rouge2_recall_stderr": 0.001441559167188345, "rougeL_fmeasure": 0.12292479290951157, "rougeL_fmeasure_stderr": 0.0015261254326798437, "rougeL_precision": 0.11098220652021759, "rougeL_precision_stderr": 0.0017249339058787237, "rougeL_recall": 0.17882334047937634, "rougeL_recall_stderr": 0.002441176945496047, "rougeLsum_fmeasure": 0.15163244869667974, "rougeLsum_fmeasure_stderr": 0.001949601678988161, "rougeLsum_precision": 0.1379056588631655, "rougeLsum_precision_stderr": 0.002188641876469295, "rougeLsum_recall": 0.2165921381447695, "rougeLsum_recall_stderr": 0.002892803013438996}}, "4": {"tldr_en": {"bleu": 0.43692124770591884, "bleu_stderr": 0.034589878125136274, "rouge1_fmeasure": 0.05320898989288258, "rouge1_fmeasure_stderr": 0.0018000803783716336, "rouge1_precision": 0.05106771575902374, "rouge1_precision_stderr": 0.0019659203953794374, "rouge1_recall": 0.07804501442013374, "rouge1_recall_stderr": 0.002681543421696101, "rouge2_fmeasure": 0.0115081158992026, "rouge2_fmeasure_stderr": 0.0005924515945289618, "rouge2_precision": 0.010915560953831725, "rouge2_precision_stderr": 0.0006843232384822272, "rouge2_recall": 0.017333650821899818, "rouge2_recall_stderr": 0.0009520410019829541, "rougeL_fmeasure": 0.0406022993863923, "rougeL_fmeasure_stderr": 0.001350639486420207, "rougeL_precision": 0.03912914603871578, "rougeL_precision_stderr": 0.0015292266274958602, "rougeL_recall": 0.06051029913811969, "rougeL_recall_stderr": 0.002087906460503723, "rougeLsum_fmeasure": 0.04978211979263513, "rougeLsum_fmeasure_stderr": 0.0016808277076301248, "rougeLsum_precision": 0.04795995322666103, "rougeLsum_precision_stderr": 0.0018654956003536299, "rougeLsum_recall": 0.0730595855068419, "rougeLsum_recall_stderr": 0.002511680618409503}}, "5": {"tldr_en": {"bleu": 8.693135496106579e-07, "bleu_stderr": 2.0347209537079987e-06, "rouge1_fmeasure": 0.009154755966022539, "rouge1_fmeasure_stderr": 0.0008437188329211569, "rouge1_precision": 0.008888598534123287, "rouge1_precision_stderr": 0.0009074304457843781, "rouge1_recall": 0.013732262968419945, "rouge1_recall_stderr": 0.0012775310603661385, "rouge2_fmeasure": 0.0019115778864576577, "rouge2_fmeasure_stderr": 0.0002415435510558745, "rouge2_precision": 0.0020329208663477155, "rouge2_precision_stderr": 0.0004034253796867396, "rouge2_recall": 0.0030032403059095103, "rouge2_recall_stderr": 0.0004244983130075312, "rougeL_fmeasure": 0.006902885273901401, "rougeL_fmeasure_stderr": 0.0006295358749076249, "rougeL_precision": 0.006715096170350463, "rougeL_precision_stderr": 0.000704577952426742, "rougeL_recall": 0.010603012416762791, "rougeL_recall_stderr": 0.0010070501511240425, "rougeLsum_fmeasure": 0.008435486741119774, "rougeLsum_fmeasure_stderr": 0.0007773340965961406, "rougeLsum_precision": 0.00825151495932399, "rougeLsum_precision_stderr": 0.0008497435087111472, "rougeLsum_recall": 0.0126360914173471, "rougeLsum_recall_stderr": 0.0011775115731198504}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.015996891880542208, "bleu_stderr": 0.004310800871207989, "rouge1_fmeasure": 0.01581057979538024, "rouge1_fmeasure_stderr": 0.0003268681446111073, "rouge1_precision": 0.012607142857143292, "rouge1_precision_stderr": 0.0002620854055855113, "rouge1_recall": 0.022364589648673473, "rouge1_recall_stderr": 0.0004833744939072677, "rouge2_fmeasure": 0.00012949433318118156, "rouge2_fmeasure_stderr": 3.494334474550083e-05, "rouge2_precision": 0.00011382113821138212, "rouge2_precision_stderr": 3.0353975217624302e-05, "rouge2_recall": 0.00016007109878338666, "rouge2_recall_stderr": 4.471587603586543e-05, "rougeL_fmeasure": 0.01581057979538024, "rougeL_fmeasure_stderr": 0.0003268681446111073, "rougeL_precision": 0.012607142857143292, "rougeL_precision_stderr": 0.0002620854055855113, "rougeL_recall": 0.022364589648673473, "rougeL_recall_stderr": 0.0004833744939072677, "rougeLsum_fmeasure": 0.015304580133401937, "rougeLsum_fmeasure_stderr": 0.00030990342484444515, "rougeLsum_precision": 0.012202380952381383, "rougeLsum_precision_stderr": 0.00024852907150074173, "rougeLsum_recall": 0.02165931137163564, "rougeLsum_recall_stderr": 0.000459555205796444}}, "1": {"generate_text_restaurant": {"bleu": 10.160164880301407, "bleu_stderr": 0.09870361519554652, "rouge1_fmeasure": 0.4183603415646924, "rouge1_fmeasure_stderr": 0.0021915886347872913, "rouge1_precision": 0.4723645042246602, "rouge1_precision_stderr": 0.003093544138341839, "rouge1_recall": 0.41769731726334886, "rouge1_recall_stderr": 0.0027761936199660036, "rouge2_fmeasure": 0.18224154640367043, "rouge2_fmeasure_stderr": 0.001744075951680649, "rouge2_precision": 0.20824659603338538, "rouge2_precision_stderr": 0.0022062119262852707, "rouge2_recall": 0.1819755290759944, "rouge2_recall_stderr": 0.0019466842287792123, "rougeL_fmeasure": 0.29563040377094635, "rougeL_fmeasure_stderr": 0.0018175171603569897, "rougeL_precision": 0.3351327678327539, "rougeL_precision_stderr": 0.0025523703114727794, "rougeL_recall": 0.295579474348389, "rougeL_recall_stderr": 0.0022293299165326882, "rougeLsum_fmeasure": 0.33772472890005967, "rougeLsum_fmeasure_stderr": 0.0020840945335635214, "rougeLsum_precision": 0.38216044781782504, "rougeLsum_precision_stderr": 0.002840674624973531, "rougeLsum_recall": 0.33669899123255476, "rougeLsum_recall_stderr": 0.002497483152367024}}, "2": {"generate_text_restaurant": {"bleu": 11.639342595686514, "bleu_stderr": 0.1372398556772298, "rouge1_fmeasure": 0.4403748840276186, "rouge1_fmeasure_stderr": 0.0021469015406322322, "rouge1_precision": 0.5008112994914655, "rouge1_precision_stderr": 0.0030901508544956993, "rouge1_recall": 0.4331586166619688, "rouge1_recall_stderr": 0.0027669107693439566, "rouge2_fmeasure": 0.20392339700886974, "rouge2_fmeasure_stderr": 0.0017713110997826917, "rouge2_precision": 0.23532637411562363, "rouge2_precision_stderr": 0.0023540846426583248, "rouge2_recall": 0.20098225485559104, "rouge2_recall_stderr": 0.002000098072621399, "rougeL_fmeasure": 0.3101075960976144, "rougeL_fmeasure_stderr": 0.0018538357828329025, "rougeL_precision": 0.3541570070163828, "rougeL_precision_stderr": 0.0026216980786301844, "rougeL_recall": 0.30536324071501125, "rougeL_recall_stderr": 0.0022959263532433085, "rougeLsum_fmeasure": 0.3568310755537105, "rougeLsum_fmeasure_stderr": 0.0021021781653240523, "rougeLsum_precision": 0.4060963348848002, "rougeLsum_precision_stderr": 0.00287308069523248, "rougeLsum_recall": 0.35125355708974765, "rougeLsum_recall_stderr": 0.002574327739391992}}, "3": {"generate_text_restaurant": {"bleu": 12.12151131378298, "bleu_stderr": 0.11915100827089956, "rouge1_fmeasure": 0.4393124622349425, "rouge1_fmeasure_stderr": 0.002105146952485857, "rouge1_precision": 0.48319371001463535, "rouge1_precision_stderr": 0.002959676881031021, "rouge1_recall": 0.4422942530679121, "rouge1_recall_stderr": 0.002745785251032174, "rouge2_fmeasure": 0.20537187889646735, "rouge2_fmeasure_stderr": 0.0017679657376305042, "rouge2_precision": 0.22776533967342907, "rouge2_precision_stderr": 0.002199578780588574, "rouge2_recall": 0.20780683933237928, "rouge2_recall_stderr": 0.0020519338118908715, "rougeL_fmeasure": 0.30827570526127646, "rougeL_fmeasure_stderr": 0.001838483091209031, "rougeL_precision": 0.34008741895966316, "rougeL_precision_stderr": 0.002497623460553464, "rougeL_recall": 0.3107412356396501, "rougeL_recall_stderr": 0.0022904606355026516, "rougeLsum_fmeasure": 0.3590927766034753, "rougeLsum_fmeasure_stderr": 0.0020747916723764977, "rougeLsum_precision": 0.3948986426501321, "rougeLsum_precision_stderr": 0.0027465008011313763, "rougeLsum_recall": 0.36193901740453144, "rougeLsum_recall_stderr": 0.0025838482295284003}}, "4": {"generate_text_restaurant": {"bleu": 11.559015928325927, "bleu_stderr": 0.18803967369078045, "rouge1_fmeasure": 0.4317309670284675, "rouge1_fmeasure_stderr": 0.002056102439511285, "rouge1_precision": 0.45547982198924447, "rouge1_precision_stderr": 0.0028623291621968612, "rouge1_recall": 0.4496956638404531, "rouge1_recall_stderr": 0.002644952608966691, "rouge2_fmeasure": 0.200121282410508, "rouge2_fmeasure_stderr": 0.001720634914837748, "rouge2_precision": 0.21266205937348281, "rouge2_precision_stderr": 0.0020948347718174865, "rouge2_recall": 0.2094291861066839, "rouge2_recall_stderr": 0.0020099108277091484, "rougeL_fmeasure": 0.3022050091930064, "rougeL_fmeasure_stderr": 0.001805017629409357, "rougeL_precision": 0.31926641390021887, "rougeL_precision_stderr": 0.0023710826604788795, "rougeL_recall": 0.3157467625853494, "rougeL_recall_stderr": 0.0022866495141614174, "rougeLsum_fmeasure": 0.3556255169379216, "rougeLsum_fmeasure_stderr": 0.0020739273107426392, "rougeLsum_precision": 0.3747187674171436, "rougeLsum_precision_stderr": 0.002662920050950552, "rougeLsum_recall": 0.37116215718680456, "rougeLsum_recall_stderr": 0.0025805030574394894}}, "5": {"generate_text_restaurant": {"bleu": 10.718543658577635, "bleu_stderr": 0.13557644877495797, "rouge1_fmeasure": 0.4260986497991647, "rouge1_fmeasure_stderr": 0.0019558770098863274, "rouge1_precision": 0.43624590990474293, "rouge1_precision_stderr": 0.002755359910046268, "rouge1_recall": 0.45743821166090926, "rouge1_recall_stderr": 0.0025961821255812874, "rouge2_fmeasure": 0.19429987532612708, "rouge2_fmeasure_stderr": 0.0016645032075996245, "rouge2_precision": 0.19962516392680765, "rouge2_precision_stderr": 0.0019681113841036675, "rouge2_recall": 0.21014052881618187, "rouge2_recall_stderr": 0.001996018471125935, "rougeL_fmeasure": 0.29713861834051897, "rougeL_fmeasure_stderr": 0.0017327548341535156, "rougeL_precision": 0.30445962134555027, "rougeL_precision_stderr": 0.0022570971354846756, "rougeL_recall": 0.319832443105098, "rougeL_recall_stderr": 0.002242049253249801, "rougeLsum_fmeasure": 0.3541205313176406, "rougeLsum_fmeasure_stderr": 0.0019745070250081604, "rougeLsum_precision": 0.36210497757070403, "rougeLsum_precision_stderr": 0.0025479433447278678, "rougeLsum_recall": 0.38092539069745174, "rougeLsum_recall_stderr": 0.00253423877318417}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.893091380773236, "bleu_stderr": 0.07155879041797279, "rouge1_fmeasure": 0.2061985807221383, "rouge1_fmeasure_stderr": 0.0025614797423455142, "rouge1_precision": 0.1594590911124501, "rouge1_precision_stderr": 0.0022782733083245854, "rouge1_recall": 0.3283512452700197, "rouge1_recall_stderr": 0.004306520729443716, "rouge2_fmeasure": 0.044020752612575986, "rouge2_fmeasure_stderr": 0.0015514733272418211, "rouge2_precision": 0.03310698595501658, "rouge2_precision_stderr": 0.001224021675086901, "rouge2_recall": 0.07342630951021904, "rouge2_recall_stderr": 0.0026234650563968604, "rougeL_fmeasure": 0.15357121051296377, "rougeL_fmeasure_stderr": 0.0019035866015213703, "rougeL_precision": 0.11840027221905174, "rougeL_precision_stderr": 0.00166823447110928, "rougeL_recall": 0.24641286838690638, "rougeL_recall_stderr": 0.003356144965161673, "rougeLsum_fmeasure": 0.15982417698859158, "rougeLsum_fmeasure_stderr": 0.0021047185959146048, "rougeLsum_precision": 0.12282873435732601, "rougeLsum_precision_stderr": 0.0017745649485105812, "rougeLsum_recall": 0.25756924529549197, "rougeLsum_recall_stderr": 0.0037577915871248534}}, "1": {"article_DOC_summary": {"bleu": 1.1760544673245734, "bleu_stderr": 0.08133234405781672, "rouge1_fmeasure": 0.1644736490786607, "rouge1_fmeasure_stderr": 0.002289825635722979, "rouge1_precision": 0.11666645790427581, "rouge1_precision_stderr": 0.0017014755837046706, "rouge1_recall": 0.28978527393441794, "rouge1_recall_stderr": 0.0038991635104653646, "rouge2_fmeasure": 0.02968537882036386, "rouge2_fmeasure_stderr": 0.0012639179125521537, "rouge2_precision": 0.020871037943975598, "rouge2_precision_stderr": 0.0008902042501889368, "rouge2_recall": 0.05355457980003852, "rouge2_recall_stderr": 0.0023314697329144688, "rougeL_fmeasure": 0.12816828557628004, "rougeL_fmeasure_stderr": 0.0017421199503176153, "rougeL_precision": 0.09068714229050606, "rougeL_precision_stderr": 0.0012801677393527455, "rougeL_recall": 0.22742162679707073, "rougeL_recall_stderr": 0.0030968609463424912, "rougeLsum_fmeasure": 0.13227157946601684, "rougeLsum_fmeasure_stderr": 0.0018913833448635142, "rougeLsum_precision": 0.09360786095706375, "rougeLsum_precision_stderr": 0.0013862407744166395, "rougeLsum_recall": 0.23441366999836272, "rougeLsum_recall_stderr": 0.0033283603169613893}}, "2": {"article_DOC_summary": {"bleu": 1.1923931410727253, "bleu_stderr": 0.096157535871299, "rouge1_fmeasure": 0.16654675192354085, "rouge1_fmeasure_stderr": 0.0022693587032095544, "rouge1_precision": 0.11788198252469814, "rouge1_precision_stderr": 0.0016840268338500827, "rouge1_recall": 0.29510022232622024, "rouge1_recall_stderr": 0.003931015696733789, "rouge2_fmeasure": 0.030040223235727605, "rouge2_fmeasure_stderr": 0.001269654630895902, "rouge2_precision": 0.02100747266618839, "rouge2_precision_stderr": 0.0008812797803659204, "rouge2_recall": 0.05497523294613044, "rouge2_recall_stderr": 0.0024458906023431502, "rougeL_fmeasure": 0.13083582888851075, "rougeL_fmeasure_stderr": 0.0017198624344877728, "rougeL_precision": 0.09243404834906441, "rougeL_precision_stderr": 0.0012651350635372006, "rougeL_recall": 0.23303993493040487, "rougeL_recall_stderr": 0.003075264655112234, "rougeLsum_fmeasure": 0.13353807347880225, "rougeLsum_fmeasure_stderr": 0.001873990491132275, "rougeLsum_precision": 0.09431829617737796, "rougeLsum_precision_stderr": 0.0013719159321866803, "rougeLsum_recall": 0.23802791061424466, "rougeLsum_recall_stderr": 0.0033649952401285928}}, "3": {"article_DOC_summary": {"bleu": 1.3191345882756016, "bleu_stderr": 0.09421067837162811, "rouge1_fmeasure": 0.16270135093872945, "rouge1_fmeasure_stderr": 0.0024951785759190037, "rouge1_precision": 0.11793417351785487, "rouge1_precision_stderr": 0.002017377276366173, "rouge1_recall": 0.2836315744233739, "rouge1_recall_stderr": 0.004290159570320814, "rouge2_fmeasure": 0.030050510110479087, "rouge2_fmeasure_stderr": 0.0013009231624033775, "rouge2_precision": 0.021590058499460735, "rouge2_precision_stderr": 0.000983046658889197, "rouge2_recall": 0.05387363083458545, "rouge2_recall_stderr": 0.0023820277892373596, "rougeL_fmeasure": 0.1279568957546, "rougeL_fmeasure_stderr": 0.0018961537785853696, "rougeL_precision": 0.09245687482962514, "rougeL_precision_stderr": 0.0015075983809124832, "rougeL_recall": 0.22467473031951235, "rougeL_recall_stderr": 0.003391085028163447, "rougeLsum_fmeasure": 0.13073618241924947, "rougeLsum_fmeasure_stderr": 0.0020835700028823006, "rougeLsum_precision": 0.09456074152436214, "rougeLsum_precision_stderr": 0.0016592694935628211, "rougeLsum_recall": 0.22933200389418706, "rougeLsum_recall_stderr": 0.0036933254855523244}}, "4": {"article_DOC_summary": {"bleu": 0.5682110373693539, "bleu_stderr": 0.08502122706344573, "rouge1_fmeasure": 0.04429948639681006, "rouge1_fmeasure_stderr": 0.0024800638787534734, "rouge1_precision": 0.0367038903617586, "rouge1_precision_stderr": 0.002159913173134104, "rouge1_recall": 0.06993097002986987, "rouge1_recall_stderr": 0.003981889947602205, "rouge2_fmeasure": 0.007253870650249119, "rouge2_fmeasure_stderr": 0.0007371032526022674, "rouge2_precision": 0.005663786671792409, "rouge2_precision_stderr": 0.0006131630864231121, "rouge2_recall": 0.011891116216861553, "rouge2_recall_stderr": 0.0012302505001022532, "rougeL_fmeasure": 0.03463725804962344, "rougeL_fmeasure_stderr": 0.0019206641793292863, "rougeL_precision": 0.029073862632456978, "rougeL_precision_stderr": 0.0017427910212302695, "rougeL_recall": 0.05493887124617477, "rougeL_recall_stderr": 0.00313835664453676, "rougeLsum_fmeasure": 0.035650820163199474, "rougeLsum_fmeasure_stderr": 0.002013697777536067, "rougeLsum_precision": 0.02992033353583286, "rougeLsum_precision_stderr": 0.0018236172991812475, "rougeLsum_recall": 0.056338516909847834, "rougeLsum_recall_stderr": 0.0032573814363183865}}, "5": {"article_DOC_summary": {"bleu": 7.234219387755872e-40, "bleu_stderr": 8.513383008510971e-35, "rouge1_fmeasure": 0.0021354726500630848, "rouge1_fmeasure_stderr": 0.0005598260412101052, "rouge1_precision": 0.0023302457662671455, "rouge1_precision_stderr": 0.0006112488722810845, "rouge1_recall": 0.002024827111990807, "rouge1_recall_stderr": 0.0005399672158721946, "rouge2_fmeasure": 8.848593754254134e-05, "rouge2_fmeasure_stderr": 6.272722707705515e-05, "rouge2_precision": 0.00010405105438401775, "rouge2_precision_stderr": 7.357743885200423e-05, "rouge2_recall": 7.71869639794168e-05, "rouge2_recall_stderr": 5.489230344136828e-05, "rougeL_fmeasure": 0.00164411367748071, "rougeL_fmeasure_stderr": 0.0004347092185887277, "rougeL_precision": 0.0018006566925166142, "rougeL_precision_stderr": 0.000476195417495383, "rougeL_recall": 0.001551384791989109, "rougeL_recall_stderr": 0.0004151810881735085, "rougeLsum_fmeasure": 0.0017350747461433223, "rougeLsum_fmeasure_stderr": 0.00045807565970479823, "rougeLsum_precision": 0.001898751962642961, "rougeLsum_precision_stderr": 0.0005005472882701675, "rougeLsum_recall": 0.001637972732350634, "rougeLsum_recall_stderr": 0.0004381341521417912}}}} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0.csv b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..43c0cd6fa6fb2a0800120e772624f2a9aa23b7b4 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.338,0.014965960710224485,0 +anli_r2,acc,0.333,0.014910846164229863,0 +anli_r3,acc,0.3358333333333333,0.013639261190932882,0 +arc_challenge,acc,0.2440273037542662,0.01255144762785626,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097224,0 +arc_easy,acc,0.5673400673400674,0.010166307932642867,0 +arc_easy,acc_norm,0.4962121212121212,0.010259489101351847,0 +boolq,acc,0.5978593272171254,0.008575926383211252,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.21777777777777776,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4374626568412667,0.004950598300667558,0 +hellaswag,acc_norm,0.5612427803226449,0.004952209831856566,0 +piqa,acc,0.733949945593036,0.010310039263352831,0 +piqa,acc_norm,0.7372143634385201,0.010269354068140777,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.805,0.012535235623319322,0 +sciq,acc_norm,0.717,0.014251810906481742,0 +storycloze_2016,acc,0.6916087653661144,0.010679734445487796,0 +winogrande,acc,0.5603788476716653,0.013949649776015689,0 diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json deleted file mode 100644 index fb69a8c9fdc9fc34f20d15ac33bb5edc83616f9f..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.014965960710224485 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229863 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.013639261190932882 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.21777777777777776 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4374626568412667, - "acc_stderr": 0.004950598300667558, - "acc_norm": 0.5612427803226449, - "acc_norm_stderr": 0.004952209831856566 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5603788476716653, - "acc_stderr": 0.013949649776015689 - }, - "storycloze_2016": { - "acc": 0.6916087653661144, - "acc_stderr": 0.010679734445487796 - }, - "boolq": { - "acc": 0.5978593272171254, - "acc_stderr": 0.008575926383211252 - }, - "arc_easy": { - "acc": 0.5673400673400674, - "acc_stderr": 0.010166307932642867, - "acc_norm": 0.4962121212121212, - "acc_norm_stderr": 0.010259489101351847 - }, - "arc_challenge": { - "acc": 0.2440273037542662, - "acc_stderr": 0.01255144762785626, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097224 - }, - "sciq": { - "acc": 0.805, - "acc_stderr": 0.012535235623319322, - "acc_norm": 0.717, - "acc_norm_stderr": 0.014251810906481742 - }, - "piqa": { - "acc": 0.733949945593036, - "acc_stderr": 0.010310039263352831, - "acc_norm": 0.7372143634385201, - "acc_norm_stderr": 0.010269354068140777 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1.csv b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..2d42d86bee02f5c1099627cc013f4384190a8101 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.344,0.015029633724408947,0 +anli_r2,acc,0.333,0.01491084616422986,0 +anli_r3,acc,0.35333333333333333,0.01380457216231493,0 +arc_challenge,acc,0.24658703071672355,0.012595726268790124,0 +arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0 +arc_easy,acc,0.5765993265993266,0.01013867100528905,0 +arc_easy,acc_norm,0.5361952861952862,0.010232865550346736,0 +boolq,acc,0.5807339449541284,0.008630302070999097,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.37714285714285706,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.433379804819757,0.00494529127007243,0 +hellaswag,acc_norm,0.5638319059948218,0.00494895251951751,0 +piqa,acc,0.7399347116430903,0.01023489324906129,0 +piqa,acc_norm,0.736126224156692,0.01028299636769557,0 +rte,acc,0.5379061371841155,0.030009848912529113,0 +sciq,acc,0.846,0.011419913065098706,0 +sciq,acc_norm,0.824,0.012048616898597498,0 +storycloze_2016,acc,0.6803848209513629,0.01078375973373075,0 +winogrande,acc,0.5430149960536701,0.01400038676159829,0 diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json deleted file mode 100644 index 112bf6a579f6d294c6c77a501ab5f4779237e924..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408947 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.01491084616422986 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.01380457216231493 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.37714285714285706 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.433379804819757, - "acc_stderr": 0.00494529127007243, - "acc_norm": 0.5638319059948218, - "acc_norm_stderr": 0.00494895251951751 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.5430149960536701, - "acc_stderr": 0.01400038676159829 - }, - "storycloze_2016": { - "acc": 0.6803848209513629, - "acc_stderr": 0.01078375973373075 - }, - "boolq": { - "acc": 0.5807339449541284, - "acc_stderr": 0.008630302070999097 - }, - "arc_easy": { - "acc": 0.5765993265993266, - "acc_stderr": 0.01013867100528905, - "acc_norm": 0.5361952861952862, - "acc_norm_stderr": 0.010232865550346736 - }, - "arc_challenge": { - "acc": 0.24658703071672355, - "acc_stderr": 0.012595726268790124, - "acc_norm": 0.28924914675767915, - "acc_norm_stderr": 0.013250012579393443 - }, - "sciq": { - "acc": 0.846, - "acc_stderr": 0.011419913065098706, - "acc_norm": 0.824, - "acc_norm_stderr": 0.012048616898597498 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.01023489324906129, - "acc_norm": 0.736126224156692, - "acc_norm_stderr": 0.01028299636769557 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2.csv b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..cfb6b7973a1ad92810b09d7ee30447100c998464 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.014888272588203943,0 +anli_r2,acc,0.329,0.01486539538592836,0 +anli_r3,acc,0.3466666666666667,0.013744022550571946,0 +arc_challenge,acc,0.25426621160409557,0.012724999945157744,0 +arc_challenge,acc_norm,0.29692832764505117,0.013352025976725223,0 +arc_easy,acc,0.5896464646464646,0.010093531255765457,0 +arc_easy,acc_norm,0.5496632996632996,0.010209047724374143,0 +boolq,acc,0.5767584097859327,0.008641391399113584,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.27939042089985483,,1 +copa,acc,0.76,0.042923469599092816,0 +hellaswag,acc,0.43019318860784705,0.004940911779273377,0 +hellaswag,acc_norm,0.5633339972117108,0.00494958956767889,0 +piqa,acc,0.7328618063112078,0.010323440492612437,0 +piqa,acc_norm,0.7274211099020674,0.010389256803296007,0 +rte,acc,0.51985559566787,0.030072723167317184,0 +sciq,acc,0.869,0.010674874844837957,0 +sciq,acc_norm,0.854,0.011171786285496497,0 +storycloze_2016,acc,0.6873329770176376,0.010720223172953165,0 +winogrande,acc,0.5580110497237569,0.013957584079108997,0 diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json deleted file mode 100644 index 60c52b2b84c20f107100535bd1829bfdfcddedb9..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203943 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.01486539538592836 - }, - "anli_r3": { - "acc": 0.3466666666666667, - "acc_stderr": 0.013744022550571946 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.27939042089985483 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.042923469599092816 - }, - "hellaswag": { - "acc": 0.43019318860784705, - "acc_stderr": 0.004940911779273377, - "acc_norm": 0.5633339972117108, - "acc_norm_stderr": 0.00494958956767889 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317184 - }, - "winogrande": { - "acc": 0.5580110497237569, - "acc_stderr": 0.013957584079108997 - }, - "storycloze_2016": { - "acc": 0.6873329770176376, - "acc_stderr": 0.010720223172953165 - }, - "boolq": { - "acc": 0.5767584097859327, - "acc_stderr": 0.008641391399113584 - }, - "arc_easy": { - "acc": 0.5896464646464646, - "acc_stderr": 0.010093531255765457, - "acc_norm": 0.5496632996632996, - "acc_norm_stderr": 0.010209047724374143 - }, - "arc_challenge": { - "acc": 0.25426621160409557, - "acc_stderr": 0.012724999945157744, - "acc_norm": 0.29692832764505117, - "acc_norm_stderr": 0.013352025976725223 - }, - "sciq": { - "acc": 0.869, - "acc_stderr": 0.010674874844837957, - "acc_norm": 0.854, - "acc_norm_stderr": 0.011171786285496497 - }, - "piqa": { - "acc": 0.7328618063112078, - "acc_stderr": 0.010323440492612437, - "acc_norm": 0.7274211099020674, - "acc_norm_stderr": 0.010389256803296007 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3.csv b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..58f51cf87e22228d8bb0544537b694fd01183696 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.341,0.014998131348402718,0 +anli_r2,acc,0.344,0.015029633724408945,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207706,0 +arc_challenge,acc_norm,0.2883959044368601,0.013238394422428171,0 +arc_easy,acc,0.5791245791245792,0.01013050216406633,0 +arc_easy,acc_norm,0.5618686868686869,0.010180937100600067,0 +boolq,acc,0.5837920489296636,0.008621380519419278,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.2988586070347077,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4304919338777136,0.004941331215598548,0 +hellaswag,acc_norm,0.563433578968333,0.004949462563681335,0 +piqa,acc,0.7383025027203483,0.010255630772708229,0 +piqa,acc_norm,0.7366702937976061,0.010276185322196764,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.868,0.010709373963528035,0 +sciq,acc_norm,0.865,0.010811655372416051,0 +storycloze_2016,acc,0.686798503474078,0.0107252094229294,0 +winogrande,acc,0.5493291239147593,0.01398392886904024,0 diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json deleted file mode 100644 index e6e7a9f9a1f4287a22b5629c842442e0af4428d6..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.341, - "acc_stderr": 0.014998131348402718 - }, - "anli_r2": { - "acc": 0.344, - "acc_stderr": 0.015029633724408945 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.2988586070347077 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4304919338777136, - "acc_stderr": 0.004941331215598548, - "acc_norm": 0.563433578968333, - "acc_norm_stderr": 0.004949462563681335 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5493291239147593, - "acc_stderr": 0.01398392886904024 - }, - "storycloze_2016": { - "acc": 0.686798503474078, - "acc_stderr": 0.0107252094229294 - }, - "boolq": { - "acc": 0.5837920489296636, - "acc_stderr": 0.008621380519419278 - }, - "arc_easy": { - "acc": 0.5791245791245792, - "acc_stderr": 0.01013050216406633, - "acc_norm": 0.5618686868686869, - "acc_norm_stderr": 0.010180937100600067 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207706, - "acc_norm": 0.2883959044368601, - "acc_norm_stderr": 0.013238394422428171 - }, - "sciq": { - "acc": 0.868, - "acc_stderr": 0.010709373963528035, - "acc_norm": 0.865, - "acc_norm_stderr": 0.010811655372416051 - }, - "piqa": { - "acc": 0.7383025027203483, - "acc_stderr": 0.010255630772708229, - "acc_norm": 0.7366702937976061, - "acc_norm_stderr": 0.010276185322196764 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_4.csv b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..9b823415d2219842a13591cd792f4fe970021c80 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.351,0.015100563798316405,0 +anli_r2,acc,0.337,0.01495508791865361,0 +anli_r3,acc,0.3258333333333333,0.013535422043417462,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.2935153583617747,0.013307250444941117,0 +arc_easy,acc,0.5942760942760943,0.010075755540128876,0 +arc_easy,acc_norm,0.5656565656565656,0.010170943451269421,0 +boolq,acc,0.5752293577981651,0.008645503833361105,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3516908212560386,,1 +copa,acc,0.74,0.04408440022768078,0 +hellaswag,acc,0.4282015534754033,0.004938068627349493,0 +hellaswag,acc_norm,0.5673172674765983,0.004944351065545864,0 +piqa,acc,0.735582154515778,0.01028978724476717,0 +piqa,acc_norm,0.7393906420021763,0.010241826155811635,0 +rte,acc,0.51985559566787,0.030072723167317177,0 +sciq,acc,0.876,0.010427498872343968,0 +sciq,acc_norm,0.867,0.010743669132397353,0 +storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0 +winogrande,acc,0.55327545382794,0.013972488371616697,0 diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json deleted file mode 100644 index 6fc99ffb0d11c9f1f763b1d876819fad940dca4a..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316405 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.01495508791865361 - }, - "anli_r3": { - "acc": 0.3258333333333333, - "acc_stderr": 0.013535422043417462 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3516908212560386 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768078 - }, - "hellaswag": { - "acc": 0.4282015534754033, - "acc_stderr": 0.004938068627349493, - "acc_norm": 0.5673172674765983, - "acc_norm_stderr": 0.004944351065545864 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317177 - }, - "winogrande": { - "acc": 0.55327545382794, - "acc_stderr": 0.013972488371616697 - }, - "storycloze_2016": { - "acc": 0.6932121859967931, - "acc_stderr": 0.010664275190473634 - }, - "boolq": { - "acc": 0.5752293577981651, - "acc_stderr": 0.008645503833361105 - }, - "arc_easy": { - "acc": 0.5942760942760943, - "acc_stderr": 0.010075755540128876, - "acc_norm": 0.5656565656565656, - "acc_norm_stderr": 0.010170943451269421 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.2935153583617747, - "acc_norm_stderr": 0.013307250444941117 - }, - "sciq": { - "acc": 0.876, - "acc_stderr": 0.010427498872343968, - "acc_norm": 0.867, - "acc_norm_stderr": 0.010743669132397353 - }, - "piqa": { - "acc": 0.735582154515778, - "acc_stderr": 0.01028978724476717, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.010241826155811635 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_5.csv b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..9a34f63ff9a88c4d3780da180dbf010350a15815 --- /dev/null +++ b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.363,0.01521389044467128,0 +anli_r2,acc,0.331,0.014888272588203943,0 +anli_r3,acc,0.35333333333333333,0.013804572162314935,0 +arc_challenge,acc,0.2636518771331058,0.012875929151297053,0 +arc_challenge,acc_norm,0.2935153583617747,0.013307250444941118,0 +arc_easy,acc,0.5921717171717171,0.01008395024004121,0 +arc_easy,acc_norm,0.5686026936026936,0.010162752847747506,0 +boolq,acc,0.5761467889908257,0.008643046537505769,1 +cb,acc,0.5178571428571429,0.06737697508644648,1 +cb,f1,0.3561676082862524,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.43168691495717987,0.004942990623131122,0 +hellaswag,acc_norm,0.5683130850428202,0.004942990623131129,0 +piqa,acc,0.7285092491838956,0.010376251176596135,0 +piqa,acc_norm,0.7377584330794341,0.01026250256517244,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.879,0.010318210380946087,0 +sciq,acc_norm,0.882,0.010206869264381791,0 +storycloze_2016,acc,0.689470871191876,0.010700112173178448,0 +winogrande,acc,0.5382794001578532,0.014011242594964115,0 diff --git a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json b/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json deleted file mode 100644 index 042db2c8e06a6aa705a7059fce90342577d76541..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed1/evaluation/rankeval/2b855b14bc4seed1_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.363, - "acc_stderr": 0.01521389044467128 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.014888272588203943 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314935 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644648, - "f1": 0.3561676082862524 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.43168691495717987, - "acc_stderr": 0.004942990623131122, - "acc_norm": 0.5683130850428202, - "acc_norm_stderr": 0.004942990623131129 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5382794001578532, - "acc_stderr": 0.014011242594964115 - }, - "storycloze_2016": { - "acc": 0.689470871191876, - "acc_stderr": 0.010700112173178448 - }, - "boolq": { - "acc": 0.5761467889908257, - "acc_stderr": 0.008643046537505769 - }, - "arc_easy": { - "acc": 0.5921717171717171, - "acc_stderr": 0.01008395024004121, - "acc_norm": 0.5686026936026936, - "acc_norm_stderr": 0.010162752847747506 - }, - "arc_challenge": { - "acc": 0.2636518771331058, - "acc_stderr": 0.012875929151297053, - "acc_norm": 0.2935153583617747, - "acc_norm_stderr": 0.013307250444941118 - }, - "sciq": { - "acc": 0.879, - "acc_stderr": 0.010318210380946087, - "acc_norm": 0.882, - "acc_norm_stderr": 0.010206869264381791 - }, - "piqa": { - "acc": 0.7285092491838956, - "acc_stderr": 0.010376251176596135, - "acc_norm": 0.7377584330794341, - "acc_norm_stderr": 0.01026250256517244 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_0.csv b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..d0aa78d40d3318d9a3f0d284db90fad766c4a267 --- /dev/null +++ b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.309,0.014619600977206491,0 +anli_r2,acc,0.342,0.015008706182121726,0 +anli_r3,acc,0.3383333333333333,0.01366414400661827,0 +arc_challenge,acc,0.24061433447098976,0.012491468532390578,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975578,0 +arc_easy,acc,0.5627104377104377,0.010178768429321595,0 +arc_easy,acc_norm,0.4941077441077441,0.010259071083844227,0 +boolq,acc,0.6009174311926605,0.008565077958836785,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.2706349206349206,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.43616809400517825,0.00494895251951752,0 +hellaswag,acc_norm,0.5594503087034455,0.004954384702021661,0 +piqa,acc,0.7383025027203483,0.010255630772708229,0 +piqa,acc_norm,0.7410228509249184,0.010220966031405626,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.812,0.012361586015103761,0 +sciq,acc_norm,0.718,0.014236526215291338,0 +storycloze_2016,acc,0.6889363976483164,0.010705164869803167,0 +winogrande,acc,0.5438042620363063,0.01399845361092432,0 diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json deleted file mode 100644 index 9714735257dbc5569c054d2f0411a9d397022c09..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.309, - "acc_stderr": 0.014619600977206491 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.015008706182121726 - }, - "anli_r3": { - "acc": 0.3383333333333333, - "acc_stderr": 0.01366414400661827 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.2706349206349206 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.43616809400517825, - "acc_stderr": 0.00494895251951752, - "acc_norm": 0.5594503087034455, - "acc_norm_stderr": 0.004954384702021661 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5438042620363063, - "acc_stderr": 0.01399845361092432 - }, - "storycloze_2016": { - "acc": 0.6889363976483164, - "acc_stderr": 0.010705164869803167 - }, - "boolq": { - "acc": 0.6009174311926605, - "acc_stderr": 0.008565077958836785 - }, - "arc_easy": { - "acc": 0.5627104377104377, - "acc_stderr": 0.010178768429321595, - "acc_norm": 0.4941077441077441, - "acc_norm_stderr": 0.010259071083844227 - }, - "arc_challenge": { - "acc": 0.24061433447098976, - "acc_stderr": 0.012491468532390578, - "acc_norm": 0.28071672354948807, - "acc_norm_stderr": 0.013131238126975578 - }, - "sciq": { - "acc": 0.812, - "acc_stderr": 0.012361586015103761, - "acc_norm": 0.718, - "acc_norm_stderr": 0.014236526215291338 - }, - "piqa": { - "acc": 0.7383025027203483, - "acc_stderr": 0.010255630772708229, - "acc_norm": 0.7410228509249184, - "acc_norm_stderr": 0.010220966031405626 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_1.csv b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..4ab0127869a44db2976fbb58a43ec99e0a8f5d92 --- /dev/null +++ b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.321,0.014770821817934645,0 +anli_r2,acc,0.317,0.01472167543888022,0 +anli_r3,acc,0.34833333333333333,0.013759437498874072,0 +arc_challenge,acc,0.25853242320819114,0.012794553754288698,0 +arc_challenge,acc_norm,0.2781569965870307,0.013094469919538809,0 +arc_easy,acc,0.5728114478114478,0.010150415974210871,0 +arc_easy,acc_norm,0.5294612794612794,0.01024195772840968,0 +boolq,acc,0.6079510703363914,0.008538802914911997,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.3742418376564718,,1 +copa,acc,0.72,0.045126085985421276,0 +hellaswag,acc,0.4344752041426011,0.0049467486082713456,0 +hellaswag,acc_norm,0.5617406891057558,0.004951594063272048,0 +piqa,acc,0.73449401523395,0.010303308653024429,0 +piqa,acc_norm,0.7279651795429815,0.01038276378624739,0 +rte,acc,0.5667870036101083,0.029826764082138284,0 +sciq,acc,0.862,0.010912152632504397,0 +sciq,acc_norm,0.823,0.012075463420375061,0 +storycloze_2016,acc,0.6787814003206841,0.010798029402794914,0 +winogrande,acc,0.5548539857932123,0.013967662954355484,0 diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index 0184acd9aea92710aa4233618d287fe9775dce09..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.321, - "acc_stderr": 0.014770821817934645 - }, - "anli_r2": { - "acc": 0.317, - "acc_stderr": 0.01472167543888022 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874072 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.3742418376564718 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.045126085985421276 - }, - "hellaswag": { - "acc": 0.4344752041426011, - "acc_stderr": 0.0049467486082713456, - "acc_norm": 0.5617406891057558, - "acc_norm_stderr": 0.004951594063272048 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.029826764082138284 - }, - "winogrande": { - "acc": 0.5548539857932123, - "acc_stderr": 0.013967662954355484 - }, - "storycloze_2016": { - "acc": 0.6787814003206841, - "acc_stderr": 0.010798029402794914 - }, - "boolq": { - "acc": 0.6079510703363914, - "acc_stderr": 0.008538802914911997 - }, - "arc_easy": { - "acc": 0.5728114478114478, - "acc_stderr": 0.010150415974210871, - "acc_norm": 0.5294612794612794, - "acc_norm_stderr": 0.01024195772840968 - }, - "arc_challenge": { - "acc": 0.25853242320819114, - "acc_stderr": 0.012794553754288698, - "acc_norm": 0.2781569965870307, - "acc_norm_stderr": 0.013094469919538809 - }, - "sciq": { - "acc": 0.862, - "acc_stderr": 0.010912152632504397, - "acc_norm": 0.823, - "acc_norm_stderr": 0.012075463420375061 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024429, - "acc_norm": 0.7279651795429815, - "acc_norm_stderr": 0.01038276378624739 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_2.csv b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..028b8182fa1285e17d4068c9b0fb885177d603ff --- /dev/null +++ b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.302,0.01452608023545955,0 +anli_r2,acc,0.314,0.014683991951087973,0 +anli_r3,acc,0.3383333333333333,0.013664144006618271,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378606,0 +arc_challenge,acc_norm,0.28668941979522183,0.01321498632927477,0 +arc_easy,acc,0.5833333333333334,0.010116282977781247,0 +arc_easy,acc_norm,0.5555555555555556,0.010196254838691682,0 +boolq,acc,0.6024464831804281,0.008559523256936822,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.298811544991511,,1 +copa,acc,0.72,0.04512608598542128,0 +hellaswag,acc,0.4313881696873133,0.004942578520987353,0 +hellaswag,acc_norm,0.560246962756423,0.00495342618606983,0 +piqa,acc,0.7377584330794341,0.010262502565172445,0 +piqa,acc_norm,0.735038084874864,0.010296557993316042,0 +rte,acc,0.5487364620938628,0.029953149241808943,0 +sciq,acc,0.869,0.010674874844837954,0 +sciq,acc_norm,0.846,0.0114199130650987,0 +storycloze_2016,acc,0.6851950828433993,0.010740068943171381,0 +winogrande,acc,0.5485398579321231,0.013986110301017762,0 diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json deleted file mode 100644 index ccbf890a81687c0789eb2621e7731dec96970b74..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.302, - "acc_stderr": 0.01452608023545955 - }, - "anli_r2": { - "acc": 0.314, - "acc_stderr": 0.014683991951087973 - }, - "anli_r3": { - "acc": 0.3383333333333333, - "acc_stderr": 0.013664144006618271 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.298811544991511 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542128 - }, - "hellaswag": { - "acc": 0.4313881696873133, - "acc_stderr": 0.004942578520987353, - "acc_norm": 0.560246962756423, - "acc_norm_stderr": 0.00495342618606983 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808943 - }, - "winogrande": { - "acc": 0.5485398579321231, - "acc_stderr": 0.013986110301017762 - }, - "storycloze_2016": { - "acc": 0.6851950828433993, - "acc_stderr": 0.010740068943171381 - }, - "boolq": { - "acc": 0.6024464831804281, - "acc_stderr": 0.008559523256936822 - }, - "arc_easy": { - "acc": 0.5833333333333334, - "acc_stderr": 0.010116282977781247, - "acc_norm": 0.5555555555555556, - "acc_norm_stderr": 0.010196254838691682 - }, - "arc_challenge": { - "acc": 0.25341296928327645, - "acc_stderr": 0.012710896778378606, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.01321498632927477 - }, - "sciq": { - "acc": 0.869, - "acc_stderr": 0.010674874844837954, - "acc_norm": 0.846, - "acc_norm_stderr": 0.0114199130650987 - }, - "piqa": { - "acc": 0.7377584330794341, - "acc_stderr": 0.010262502565172445, - "acc_norm": 0.735038084874864, - "acc_norm_stderr": 0.010296557993316042 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_3.csv b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..eca75dbd459e0b965451762c4aef553d4825fc40 --- /dev/null +++ b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.316,0.014709193056057137,0 +anli_r2,acc,0.346,0.015050266127564448,0 +anli_r3,acc,0.3408333333333333,0.01368860079329694,0 +arc_challenge,acc,0.26023890784982934,0.012821930225112573,0 +arc_challenge,acc_norm,0.2858361774744027,0.013203196088537369,0 +arc_easy,acc,0.5854377104377104,0.010108889212447772,0 +arc_easy,acc_norm,0.5660774410774411,0.010169795770462101,0 +boolq,acc,0.6024464831804281,0.008559523256936824,1 +cb,acc,0.44642857142857145,0.067031892279424,1 +cb,f1,0.41171827565270186,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.4309898426608245,0.00494202620027959,0 +hellaswag,acc_norm,0.5628360884285999,0.004950221546187576,0 +piqa,acc,0.7421109902067464,0.01020695666205625,0 +piqa,acc_norm,0.735038084874864,0.010296557993316038,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.874,0.010499249222408047,0 +sciq,acc_norm,0.848,0.011358918303475284,0 +storycloze_2016,acc,0.6846606092998396,0.010744989116260668,0 +winogrande,acc,0.5572217837411207,0.013960157350784964,0 diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index b1ebf253dc3e90f9368d1e510d60e8cc648ada6c..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.316, - "acc_stderr": 0.014709193056057137 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.015050266127564448 - }, - "anli_r3": { - "acc": 0.3408333333333333, - "acc_stderr": 0.01368860079329694 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.067031892279424, - "f1": 0.41171827565270186 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.4309898426608245, - "acc_stderr": 0.00494202620027959, - "acc_norm": 0.5628360884285999, - "acc_norm_stderr": 0.004950221546187576 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5572217837411207, - "acc_stderr": 0.013960157350784964 - }, - "storycloze_2016": { - "acc": 0.6846606092998396, - "acc_stderr": 0.010744989116260668 - }, - "boolq": { - "acc": 0.6024464831804281, - "acc_stderr": 0.008559523256936824 - }, - "arc_easy": { - "acc": 0.5854377104377104, - "acc_stderr": 0.010108889212447772, - "acc_norm": 0.5660774410774411, - "acc_norm_stderr": 0.010169795770462101 - }, - "arc_challenge": { - "acc": 0.26023890784982934, - "acc_stderr": 0.012821930225112573, - "acc_norm": 0.2858361774744027, - "acc_norm_stderr": 0.013203196088537369 - }, - "sciq": { - "acc": 0.874, - "acc_stderr": 0.010499249222408047, - "acc_norm": 0.848, - "acc_norm_stderr": 0.011358918303475284 - }, - "piqa": { - "acc": 0.7421109902067464, - "acc_stderr": 0.01020695666205625, - "acc_norm": 0.735038084874864, - "acc_norm_stderr": 0.010296557993316038 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_4.csv b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..414e0f6a0c18787c8327ed62da728581b3b9d4f9 --- /dev/null +++ b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.306,0.014580006055436967,0 +anli_r2,acc,0.357,0.015158521721486774,0 +anli_r3,acc,0.35333333333333333,0.01380457216231493,0 +arc_challenge,acc,0.26109215017064846,0.01283552390947384,0 +arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0 +arc_easy,acc,0.5858585858585859,0.010107387673002524,0 +arc_easy,acc_norm,0.5736531986531986,0.010147858603835146,0 +boolq,acc,0.5944954128440367,0.008587459055441612,1 +cb,acc,0.5714285714285714,0.06672848092813058,1 +cb,f1,0.42325083704394045,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.42949611631149176,0.004939925958728875,0 +hellaswag,acc_norm,0.5642302330213105,0.004948439229523908,0 +piqa,acc,0.7410228509249184,0.010220966031405607,0 +piqa,acc_norm,0.733949945593036,0.01031003926335282,0 +rte,acc,0.5234657039711191,0.030063300411902652,0 +sciq,acc,0.882,0.010206869264381791,0 +sciq,acc_norm,0.858,0.011043457699378225,0 +storycloze_2016,acc,0.686798503474078,0.010725209422929396,0 +winogrande,acc,0.5603788476716653,0.013949649776015701,0 diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index 3bbca7d0214649e7b5bfe9e6bb4225d03fe54b3c..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.306, - "acc_stderr": 0.014580006055436967 - }, - "anli_r2": { - "acc": 0.357, - "acc_stderr": 0.015158521721486774 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.01380457216231493 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.42325083704394045 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.42949611631149176, - "acc_stderr": 0.004939925958728875, - "acc_norm": 0.5642302330213105, - "acc_norm_stderr": 0.004948439229523908 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.5603788476716653, - "acc_stderr": 0.013949649776015701 - }, - "storycloze_2016": { - "acc": 0.686798503474078, - "acc_stderr": 0.010725209422929396 - }, - "boolq": { - "acc": 0.5944954128440367, - "acc_stderr": 0.008587459055441612 - }, - "arc_easy": { - "acc": 0.5858585858585859, - "acc_stderr": 0.010107387673002524, - "acc_norm": 0.5736531986531986, - "acc_norm_stderr": 0.010147858603835146 - }, - "arc_challenge": { - "acc": 0.26109215017064846, - "acc_stderr": 0.01283552390947384, - "acc_norm": 0.28924914675767915, - "acc_norm_stderr": 0.013250012579393443 - }, - "sciq": { - "acc": 0.882, - "acc_stderr": 0.010206869264381791, - "acc_norm": 0.858, - "acc_norm_stderr": 0.011043457699378225 - }, - "piqa": { - "acc": 0.7410228509249184, - "acc_stderr": 0.010220966031405607, - "acc_norm": 0.733949945593036, - "acc_norm_stderr": 0.01031003926335282 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_5.csv b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..42bc8294b23332f73eb60a1dbe0c23d83692addc --- /dev/null +++ b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.311,0.014645596385722695,0 +anli_r2,acc,0.347,0.015060472031706622,0 +anli_r3,acc,0.36083333333333334,0.013869180252444865,0 +arc_challenge,acc,0.26621160409556316,0.012915774781523198,0 +arc_challenge,acc_norm,0.29948805460750855,0.01338502163731357,0 +arc_easy,acc,0.5854377104377104,0.01010888921244778,0 +arc_easy,acc_norm,0.563973063973064,0.010175459582759738,0 +boolq,acc,0.6018348623853211,0.008561755594317447,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.3462184873949579,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.43089026090420235,0.004941887610849048,0 +hellaswag,acc_norm,0.5655247958573989,0.004946748608271347,0 +piqa,acc,0.735038084874864,0.010296557993316045,0 +piqa,acc_norm,0.7388465723612623,0.010248738649935593,0 +rte,acc,0.5595667870036101,0.029882123363118726,0 +sciq,acc,0.881,0.010244215145336664,0 +sciq,acc_norm,0.869,0.010674874844837957,0 +storycloze_2016,acc,0.6819882415820417,0.010769343495248539,0 +winogrande,acc,0.531965272296764,0.014023739221166382,0 diff --git a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index 9531c27141591a7c276a1f64bb353d5b5d318c9d..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed2/evaluation/rankeval/2b855b14bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.311, - "acc_stderr": 0.014645596385722695 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706622 - }, - "anli_r3": { - "acc": 0.36083333333333334, - "acc_stderr": 0.013869180252444865 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.3462184873949579 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.43089026090420235, - "acc_stderr": 0.004941887610849048, - "acc_norm": 0.5655247958573989, - "acc_norm_stderr": 0.004946748608271347 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118726 - }, - "winogrande": { - "acc": 0.531965272296764, - "acc_stderr": 0.014023739221166382 - }, - "storycloze_2016": { - "acc": 0.6819882415820417, - "acc_stderr": 0.010769343495248539 - }, - "boolq": { - "acc": 0.6018348623853211, - "acc_stderr": 0.008561755594317447 - }, - "arc_easy": { - "acc": 0.5854377104377104, - "acc_stderr": 0.01010888921244778, - "acc_norm": 0.563973063973064, - "acc_norm_stderr": 0.010175459582759738 - }, - "arc_challenge": { - "acc": 0.26621160409556316, - "acc_stderr": 0.012915774781523198, - "acc_norm": 0.29948805460750855, - "acc_norm_stderr": 0.01338502163731357 - }, - "sciq": { - "acc": 0.881, - "acc_stderr": 0.010244215145336664, - "acc_norm": 0.869, - "acc_norm_stderr": 0.010674874844837957 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316045, - "acc_norm": 0.7388465723612623, - "acc_norm_stderr": 0.010248738649935593 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_0.csv b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..624b7577e9babc205d06506f0e80f474a54c86ea --- /dev/null +++ b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.338,0.01496596071022448,0 +anli_r2,acc,0.334,0.014922019523732965,0 +anli_r3,acc,0.3325,0.01360541734571053,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.2790102389078498,0.013106784883601336,0 +arc_easy,acc,0.5572390572390572,0.010192333348394457,0 +arc_easy,acc_norm,0.49158249158249157,0.010258329515226447,0 +boolq,acc,0.5966360856269113,0.008580168554889729,1 +cb,acc,0.4107142857142857,0.06633634150359538,1 +cb,f1,0.2283333333333333,,1 +copa,acc,0.77,0.042295258468165065,0 +hellaswag,acc,0.4367655845449114,0.004949716368890496,0 +hellaswag,acc_norm,0.5620394343756224,0.0049512221717631175,0 +piqa,acc,0.7388465723612623,0.010248738649935576,0 +piqa,acc_norm,0.7486398258977149,0.010121156016819247,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.814,0.012310790208412805,0 +sciq,acc_norm,0.731,0.014029819522568196,0 +storycloze_2016,acc,0.689470871191876,0.010700112173178448,0 +winogrande,acc,0.5382794001578532,0.014011242594964118,0 diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index d52091f53e53d10a5ce8be6b989794e51c19b5e4..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.01496596071022448 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.014922019523732965 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.01360541734571053 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359538, - "f1": 0.2283333333333333 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.042295258468165065 - }, - "hellaswag": { - "acc": 0.4367655845449114, - "acc_stderr": 0.004949716368890496, - "acc_norm": 0.5620394343756224, - "acc_norm_stderr": 0.0049512221717631175 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5382794001578532, - "acc_stderr": 0.014011242594964118 - }, - "storycloze_2016": { - "acc": 0.689470871191876, - "acc_stderr": 0.010700112173178448 - }, - "boolq": { - "acc": 0.5966360856269113, - "acc_stderr": 0.008580168554889729 - }, - "arc_easy": { - "acc": 0.5572390572390572, - "acc_stderr": 0.010192333348394457, - "acc_norm": 0.49158249158249157, - "acc_norm_stderr": 0.010258329515226447 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.2790102389078498, - "acc_norm_stderr": 0.013106784883601336 - }, - "sciq": { - "acc": 0.814, - "acc_stderr": 0.012310790208412805, - "acc_norm": 0.731, - "acc_norm_stderr": 0.014029819522568196 - }, - "piqa": { - "acc": 0.7388465723612623, - "acc_stderr": 0.010248738649935576, - "acc_norm": 0.7486398258977149, - "acc_norm_stderr": 0.010121156016819247 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_1.csv b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..4a5a9736a8cefeb391758da98a89006c521e85a7 --- /dev/null +++ b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229859,0 +anli_r2,acc,0.338,0.01496596071022448,0 +anli_r3,acc,0.3416666666666667,0.01369665877800251,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207706,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0 +arc_easy,acc,0.5728114478114478,0.010150415974210866,0 +arc_easy,acc_norm,0.5378787878787878,0.010230299628864806,0 +boolq,acc,0.6033639143730887,0.008556148582031997,1 +cb,acc,0.48214285714285715,0.06737697508644648,1 +cb,f1,0.3427741466957153,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.4331806413065126,0.0049450236570322765,0 +hellaswag,acc_norm,0.5603465445130452,0.004953305461311743,0 +piqa,acc,0.7421109902067464,0.010206956662056248,0 +piqa,acc_norm,0.7486398258977149,0.010121156016819245,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.872,0.010570133761108658,0 +sciq,acc_norm,0.847,0.011389500459665533,0 +storycloze_2016,acc,0.6819882415820417,0.010769343495248546,0 +winogrande,acc,0.5619573796369376,0.013944181296470801,0 diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json deleted file mode 100644 index 0f0ce78c0a98c9d5ee8db8cf01f4e11281917f64..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229859 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.01496596071022448 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.01369665877800251 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.06737697508644648, - "f1": 0.3427741466957153 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.4331806413065126, - "acc_stderr": 0.0049450236570322765, - "acc_norm": 0.5603465445130452, - "acc_norm_stderr": 0.004953305461311743 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5619573796369376, - "acc_stderr": 0.013944181296470801 - }, - "storycloze_2016": { - "acc": 0.6819882415820417, - "acc_stderr": 0.010769343495248546 - }, - "boolq": { - "acc": 0.6033639143730887, - "acc_stderr": 0.008556148582031997 - }, - "arc_easy": { - "acc": 0.5728114478114478, - "acc_stderr": 0.010150415974210866, - "acc_norm": 0.5378787878787878, - "acc_norm_stderr": 0.010230299628864806 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207706, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097222 - }, - "sciq": { - "acc": 0.872, - "acc_stderr": 0.010570133761108658, - "acc_norm": 0.847, - "acc_norm_stderr": 0.011389500459665533 - }, - "piqa": { - "acc": 0.7421109902067464, - "acc_stderr": 0.010206956662056248, - "acc_norm": 0.7486398258977149, - "acc_norm_stderr": 0.010121156016819245 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_2.csv b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..4e97bcb4154d6f8844c4bd154132846359ddce2b --- /dev/null +++ b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.333,0.014910846164229857,0 +anli_r3,acc,0.3283333333333333,0.013562032919529015,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858115,0 +arc_challenge,acc_norm,0.28754266211604096,0.013226719056266129,0 +arc_easy,acc,0.5820707070707071,0.010120628211017888,0 +arc_easy,acc_norm,0.5568181818181818,0.010193324837773493,0 +boolq,acc,0.6134556574923548,0.008516943934341973,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.25430680885972107,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.4291973710416252,0.0049395004048821845,0 +hellaswag,acc_norm,0.5608444532961562,0.004952698802275639,0 +piqa,acc,0.7404787812840044,0.010227939888173918,0 +piqa,acc_norm,0.7431991294885746,0.010192864802278033,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.881,0.01024421514533666,0 +sciq,acc_norm,0.864,0.01084535023047299,0 +storycloze_2016,acc,0.6835916622127205,0.010754780097940887,0 +winogrande,acc,0.5682715074980268,0.013920872110010708,0 diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json deleted file mode 100644 index 75432e58eaa5ddcc9f89ae89237edf7a43127756..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229857 - }, - "anli_r3": { - "acc": 0.3283333333333333, - "acc_stderr": 0.013562032919529015 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.25430680885972107 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.4291973710416252, - "acc_stderr": 0.0049395004048821845, - "acc_norm": 0.5608444532961562, - "acc_norm_stderr": 0.004952698802275639 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5682715074980268, - "acc_stderr": 0.013920872110010708 - }, - "storycloze_2016": { - "acc": 0.6835916622127205, - "acc_stderr": 0.010754780097940887 - }, - "boolq": { - "acc": 0.6134556574923548, - "acc_stderr": 0.008516943934341973 - }, - "arc_easy": { - "acc": 0.5820707070707071, - "acc_stderr": 0.010120628211017888, - "acc_norm": 0.5568181818181818, - "acc_norm_stderr": 0.010193324837773493 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858115, - "acc_norm": 0.28754266211604096, - "acc_norm_stderr": 0.013226719056266129 - }, - "sciq": { - "acc": 0.881, - "acc_stderr": 0.01024421514533666, - "acc_norm": 0.864, - "acc_norm_stderr": 0.01084535023047299 - }, - "piqa": { - "acc": 0.7404787812840044, - "acc_stderr": 0.010227939888173918, - "acc_norm": 0.7431991294885746, - "acc_norm_stderr": 0.010192864802278033 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_3.csv b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..7707583729456f1289bbd9a36c8f8ff934f5efcc --- /dev/null +++ b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928355,0 +anli_r2,acc,0.356,0.015149042659306632,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.26621160409556316,0.012915774781523209,0 +arc_challenge,acc_norm,0.2883959044368601,0.013238394422428164,0 +arc_easy,acc,0.5829124579124579,0.010117738967781977,0 +arc_easy,acc_norm,0.5732323232323232,0.010149141043955635,0 +boolq,acc,0.6186544342507645,0.008495245917063554,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.41258741258741266,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.43108942441744674,0.004942164585991471,0 +hellaswag,acc_norm,0.5658235411272655,0.004946353590937014,0 +piqa,acc,0.7459194776931447,0.01015727199913504,0 +piqa,acc_norm,0.7546245919477693,0.010039831320422386,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.88,0.010281328012747394,0 +sciq,acc_norm,0.864,0.010845350230472988,0 +storycloze_2016,acc,0.6878674505611972,0.01071522034627968,0 +winogrande,acc,0.5516969218626677,0.013977171307126343,0 diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json deleted file mode 100644 index 9155878fd36351f11012702b2ed51aa21501244a..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928355 - }, - "anli_r2": { - "acc": 0.356, - "acc_stderr": 0.015149042659306632 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.41258741258741266 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.43108942441744674, - "acc_stderr": 0.004942164585991471, - "acc_norm": 0.5658235411272655, - "acc_norm_stderr": 0.004946353590937014 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5516969218626677, - "acc_stderr": 0.013977171307126343 - }, - "storycloze_2016": { - "acc": 0.6878674505611972, - "acc_stderr": 0.01071522034627968 - }, - "boolq": { - "acc": 0.6186544342507645, - "acc_stderr": 0.008495245917063554 - }, - "arc_easy": { - "acc": 0.5829124579124579, - "acc_stderr": 0.010117738967781977, - "acc_norm": 0.5732323232323232, - "acc_norm_stderr": 0.010149141043955635 - }, - "arc_challenge": { - "acc": 0.26621160409556316, - "acc_stderr": 0.012915774781523209, - "acc_norm": 0.2883959044368601, - "acc_norm_stderr": 0.013238394422428164 - }, - "sciq": { - "acc": 0.88, - "acc_stderr": 0.010281328012747394, - "acc_norm": 0.864, - "acc_norm_stderr": 0.010845350230472988 - }, - "piqa": { - "acc": 0.7459194776931447, - "acc_stderr": 0.01015727199913504, - "acc_norm": 0.7546245919477693, - "acc_norm_stderr": 0.010039831320422386 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_4.csv b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..6c172116824faf519c88d87d64ee93096028692e --- /dev/null +++ b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.348,0.015070604603768408,0 +anli_r2,acc,0.35,0.015090650341444231,0 +anli_r3,acc,0.34833333333333333,0.013759437498874063,0 +arc_challenge,acc,0.25426621160409557,0.012724999945157743,0 +arc_challenge,acc_norm,0.2858361774744027,0.013203196088537369,0 +arc_easy,acc,0.5909090909090909,0.010088775152615784,0 +arc_easy,acc_norm,0.5622895622895623,0.010179856486006908,0 +boolq,acc,0.6030581039755352,0.008557276964675129,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3707043913940466,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.43238398725353516,0.00494394506961146,0 +hellaswag,acc_norm,0.5658235411272655,0.004946353590937013,0 +piqa,acc,0.7421109902067464,0.01020695666205626,0 +piqa,acc_norm,0.750272034820457,0.01009923296986747,0 +rte,acc,0.51985559566787,0.030072723167317177,0 +sciq,acc,0.891,0.00985982840703719,0 +sciq,acc_norm,0.866,0.010777762298369685,0 +storycloze_2016,acc,0.6889363976483164,0.010705164869803167,0 +winogrande,acc,0.5580110497237569,0.013957584079109008,0 diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json deleted file mode 100644 index c4c98dff012d2a1485c7ef9dc22301542a4fd590..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.348, - "acc_stderr": 0.015070604603768408 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444231 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874063 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.3707043913940466 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.43238398725353516, - "acc_stderr": 0.00494394506961146, - "acc_norm": 0.5658235411272655, - "acc_norm_stderr": 0.004946353590937013 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317177 - }, - "winogrande": { - "acc": 0.5580110497237569, - "acc_stderr": 0.013957584079109008 - }, - "storycloze_2016": { - "acc": 0.6889363976483164, - "acc_stderr": 0.010705164869803167 - }, - "boolq": { - "acc": 0.6030581039755352, - "acc_stderr": 0.008557276964675129 - }, - "arc_easy": { - "acc": 0.5909090909090909, - "acc_stderr": 0.010088775152615784, - "acc_norm": 0.5622895622895623, - "acc_norm_stderr": 0.010179856486006908 - }, - "arc_challenge": { - "acc": 0.25426621160409557, - "acc_stderr": 0.012724999945157743, - "acc_norm": 0.2858361774744027, - "acc_norm_stderr": 0.013203196088537369 - }, - "sciq": { - "acc": 0.891, - "acc_stderr": 0.00985982840703719, - "acc_norm": 0.866, - "acc_norm_stderr": 0.010777762298369685 - }, - "piqa": { - "acc": 0.7421109902067464, - "acc_stderr": 0.01020695666205626, - "acc_norm": 0.750272034820457, - "acc_norm_stderr": 0.01009923296986747 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_5.csv b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..93731168b73542d2ad7bd64acc03b336b7745f9f --- /dev/null +++ b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732968,0 +anli_r2,acc,0.341,0.0149981313484027,0 +anli_r3,acc,0.33416666666666667,0.013622434813136774,0 +arc_challenge,acc,0.26621160409556316,0.01291577478152321,0 +arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0 +arc_easy,acc,0.5955387205387206,0.010070746648278776,0 +arc_easy,acc_norm,0.569023569023569,0.01016155286349375,0 +boolq,acc,0.6168195718654435,0.008503021391450783,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.25064350064350066,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4304919338777136,0.004941331215598549,0 +hellaswag,acc_norm,0.5675164309898426,0.004944080605048773,0 +piqa,acc,0.73449401523395,0.010303308653024427,0 +piqa,acc_norm,0.7431991294885746,0.010192864802278026,0 +rte,acc,0.5054151624548736,0.030094698123239966,0 +sciq,acc,0.893,0.009779910359847167,0 +sciq,acc_norm,0.877,0.010391293421849877,0 +storycloze_2016,acc,0.6942811330839124,0.010653884866190597,0 +winogrande,acc,0.5595895816890292,0.013952330311915622,0 diff --git a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json b/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json deleted file mode 100644 index b3166656ddb8a84a1a57c188d7b30ab04feb683f..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed3/evaluation/rankeval/2b855b14bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732968 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.0149981313484027 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.013622434813136774 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.25064350064350066 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4304919338777136, - "acc_stderr": 0.004941331215598549, - "acc_norm": 0.5675164309898426, - "acc_norm_stderr": 0.004944080605048773 - }, - "rte": { - "acc": 0.5054151624548736, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5595895816890292, - "acc_stderr": 0.013952330311915622 - }, - "storycloze_2016": { - "acc": 0.6942811330839124, - "acc_stderr": 0.010653884866190597 - }, - "boolq": { - "acc": 0.6168195718654435, - "acc_stderr": 0.008503021391450783 - }, - "arc_easy": { - "acc": 0.5955387205387206, - "acc_stderr": 0.010070746648278776, - "acc_norm": 0.569023569023569, - "acc_norm_stderr": 0.01016155286349375 - }, - "arc_challenge": { - "acc": 0.26621160409556316, - "acc_stderr": 0.01291577478152321, - "acc_norm": 0.27559726962457337, - "acc_norm_stderr": 0.013057169655761838 - }, - "sciq": { - "acc": 0.893, - "acc_stderr": 0.009779910359847167, - "acc_norm": 0.877, - "acc_norm_stderr": 0.010391293421849877 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024427, - "acc_norm": 0.7431991294885746, - "acc_norm_stderr": 0.010192864802278026 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_0.csv b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..dc26177342b0b26adc1e430c7d47b03d7a461505 --- /dev/null +++ b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229857,0 +anli_r2,acc,0.34,0.014987482264363939,0 +anli_r3,acc,0.35083333333333333,0.013782212417178193,0 +arc_challenge,acc,0.24488054607508533,0.012566273985131358,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252427,0 +arc_easy,acc,0.5753367003367004,0.010142653687480414,0 +arc_easy,acc_norm,0.5046296296296297,0.01025934370588972,0 +boolq,acc,0.5853211009174312,0.008616791778981293,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.3264279758182197,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.4362676757618004,0.004949080334816025,0 +hellaswag,acc_norm,0.5585540728938458,0.004955447564694061,0 +piqa,acc,0.7464635473340587,0.010150090834551791,0 +piqa,acc_norm,0.7540805223068553,0.010047331865625177,0 +rte,acc,0.5523465703971119,0.029931070362939526,0 +sciq,acc,0.806,0.012510816141264362,0 +sciq,acc_norm,0.725,0.014127086556490528,0 +storycloze_2016,acc,0.6809192944949225,0.010778970635312489,0 +winogrande,acc,0.5580110497237569,0.01395758407910899,0 diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index 8f1195e944feaf445c1068cc14c4d830c1fb5851..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229857 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363939 - }, - "anli_r3": { - "acc": 0.35083333333333333, - "acc_stderr": 0.013782212417178193 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.3264279758182197 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.4362676757618004, - "acc_stderr": 0.004949080334816025, - "acc_norm": 0.5585540728938458, - "acc_norm_stderr": 0.004955447564694061 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5580110497237569, - "acc_stderr": 0.01395758407910899 - }, - "storycloze_2016": { - "acc": 0.6809192944949225, - "acc_stderr": 0.010778970635312489 - }, - "boolq": { - "acc": 0.5853211009174312, - "acc_stderr": 0.008616791778981293 - }, - "arc_easy": { - "acc": 0.5753367003367004, - "acc_stderr": 0.010142653687480414, - "acc_norm": 0.5046296296296297, - "acc_norm_stderr": 0.01025934370588972 - }, - "arc_challenge": { - "acc": 0.24488054607508533, - "acc_stderr": 0.012566273985131358, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252427 - }, - "sciq": { - "acc": 0.806, - "acc_stderr": 0.012510816141264362, - "acc_norm": 0.725, - "acc_norm_stderr": 0.014127086556490528 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551791, - "acc_norm": 0.7540805223068553, - "acc_norm_stderr": 0.010047331865625177 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_1.csv b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..23be3c0cd09e2be22185b2af8f09c8fa286232ed --- /dev/null +++ b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.314,0.014683991951087966,0 +anli_r2,acc,0.331,0.014888272588203936,0 +anli_r3,acc,0.36666666666666664,0.013916893275819934,0 +arc_challenge,acc,0.24573378839590443,0.012581033453730116,0 +arc_challenge,acc_norm,0.2636518771331058,0.012875929151297054,0 +arc_easy,acc,0.5749158249158249,0.010143966195717845,0 +arc_easy,acc_norm,0.5357744107744108,0.010233488709726537,0 +boolq,acc,0.5938837920489297,0.00858951094378741,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.34745762711864403,,1 +copa,acc,0.71,0.04560480215720684,0 +hellaswag,acc,0.4313881696873133,0.00494257852098735,0 +hellaswag,acc_norm,0.5603465445130452,0.004953305461311742,0 +piqa,acc,0.7410228509249184,0.010220966031405617,0 +piqa,acc_norm,0.7410228509249184,0.010220966031405621,0 +rte,acc,0.5595667870036101,0.02988212336311872,0 +sciq,acc,0.859,0.01101091459599244,0 +sciq,acc_norm,0.832,0.011828605831454267,0 +storycloze_2016,acc,0.6739711384286478,0.010839964752045182,0 +winogrande,acc,0.5501183898973955,0.013981711904049728,0 diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index aebdec7e6858ebc9cb9bf2ea3121ec0eb59343fa..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.314, - "acc_stderr": 0.014683991951087966 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.014888272588203936 - }, - "anli_r3": { - "acc": 0.36666666666666664, - "acc_stderr": 0.013916893275819934 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.34745762711864403 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.04560480215720684 - }, - "hellaswag": { - "acc": 0.4313881696873133, - "acc_stderr": 0.00494257852098735, - "acc_norm": 0.5603465445130452, - "acc_norm_stderr": 0.004953305461311742 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.02988212336311872 - }, - "winogrande": { - "acc": 0.5501183898973955, - "acc_stderr": 0.013981711904049728 - }, - "storycloze_2016": { - "acc": 0.6739711384286478, - "acc_stderr": 0.010839964752045182 - }, - "boolq": { - "acc": 0.5938837920489297, - "acc_stderr": 0.00858951094378741 - }, - "arc_easy": { - "acc": 0.5749158249158249, - "acc_stderr": 0.010143966195717845, - "acc_norm": 0.5357744107744108, - "acc_norm_stderr": 0.010233488709726537 - }, - "arc_challenge": { - "acc": 0.24573378839590443, - "acc_stderr": 0.012581033453730116, - "acc_norm": 0.2636518771331058, - "acc_norm_stderr": 0.012875929151297054 - }, - "sciq": { - "acc": 0.859, - "acc_stderr": 0.01101091459599244, - "acc_norm": 0.832, - "acc_norm_stderr": 0.011828605831454267 - }, - "piqa": { - "acc": 0.7410228509249184, - "acc_stderr": 0.010220966031405617, - "acc_norm": 0.7410228509249184, - "acc_norm_stderr": 0.010220966031405621 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_2.csv b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..f57aeb8f8d04961df3ae318f65b1358a978d673f --- /dev/null +++ b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095527,0 +anli_r2,acc,0.345,0.015039986742055235,0 +anli_r3,acc,0.35,0.013774667009018552,0 +arc_challenge,acc,0.26109215017064846,0.012835523909473841,0 +arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0 +arc_easy,acc,0.585016835016835,0.01011038315196113,0 +arc_easy,acc_norm,0.555976430976431,0.010195285580783954,0 +boolq,acc,0.6042813455657492,0.008552742471459793,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.31010101010101004,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.43158733320055764,0.004942853459371544,0 +hellaswag,acc_norm,0.5632344154550887,0.004949716368890496,0 +piqa,acc,0.73449401523395,0.010303308653024429,0 +piqa,acc_norm,0.7393906420021763,0.010241826155811635,0 +rte,acc,0.5342960288808665,0.03002557981936643,0 +sciq,acc,0.872,0.010570133761108663,0 +sciq,acc_norm,0.849,0.011328165223341673,0 +storycloze_2016,acc,0.6873329770176376,0.010720223172953177,0 +winogrande,acc,0.5485398579321231,0.013986110301017764,0 diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index f9b4898b95302c57138117149d63a3025cd4f46c..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095527 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055235 - }, - "anli_r3": { - "acc": 0.35, - "acc_stderr": 0.013774667009018552 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.31010101010101004 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.43158733320055764, - "acc_stderr": 0.004942853459371544, - "acc_norm": 0.5632344154550887, - "acc_norm_stderr": 0.004949716368890496 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.03002557981936643 - }, - "winogrande": { - "acc": 0.5485398579321231, - "acc_stderr": 0.013986110301017764 - }, - "storycloze_2016": { - "acc": 0.6873329770176376, - "acc_stderr": 0.010720223172953177 - }, - "boolq": { - "acc": 0.6042813455657492, - "acc_stderr": 0.008552742471459793 - }, - "arc_easy": { - "acc": 0.585016835016835, - "acc_stderr": 0.01011038315196113, - "acc_norm": 0.555976430976431, - "acc_norm_stderr": 0.010195285580783954 - }, - "arc_challenge": { - "acc": 0.26109215017064846, - "acc_stderr": 0.012835523909473841, - "acc_norm": 0.2773037542662116, - "acc_norm_stderr": 0.013082095839059374 - }, - "sciq": { - "acc": 0.872, - "acc_stderr": 0.010570133761108663, - "acc_norm": 0.849, - "acc_norm_stderr": 0.011328165223341673 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024429, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.010241826155811635 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_3.csv b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..a5a7293f92b8a5335a36a28f7a1c5e80914137dc --- /dev/null +++ b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.305,0.014566646394664401,0 +anli_r2,acc,0.35,0.015090650341444231,0 +anli_r3,acc,0.3566666666666667,0.013833742805050713,0 +arc_challenge,acc,0.26109215017064846,0.012835523909473847,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0 +arc_easy,acc,0.5904882154882155,0.010090368160990059,0 +arc_easy,acc_norm,0.5627104377104377,0.01017876842932159,0 +boolq,acc,0.6033639143730887,0.008556148582031999,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.4254385964912281,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.42949611631149176,0.0049399259587288745,0 +hellaswag,acc_norm,0.5625373431587333,0.004950598300667551,0 +piqa,acc,0.7459194776931447,0.01015727199913504,0 +piqa,acc_norm,0.7459194776931447,0.010157271999135046,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.88,0.010281328012747398,0 +sciq,acc_norm,0.853,0.011203415395160336,0 +storycloze_2016,acc,0.686798503474078,0.010725209422929403,0 +winogrande,acc,0.5603788476716653,0.013949649776015705,0 diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json deleted file mode 100644 index ea3ef45cfd3cca0a36af6c88c35f59a78138fdab..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.305, - "acc_stderr": 0.014566646394664401 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444231 - }, - "anli_r3": { - "acc": 0.3566666666666667, - "acc_stderr": 0.013833742805050713 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.4254385964912281 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.42949611631149176, - "acc_stderr": 0.0049399259587288745, - "acc_norm": 0.5625373431587333, - "acc_norm_stderr": 0.004950598300667551 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366426 - }, - "winogrande": { - "acc": 0.5603788476716653, - "acc_stderr": 0.013949649776015705 - }, - "storycloze_2016": { - "acc": 0.686798503474078, - "acc_stderr": 0.010725209422929403 - }, - "boolq": { - "acc": 0.6033639143730887, - "acc_stderr": 0.008556148582031999 - }, - "arc_easy": { - "acc": 0.5904882154882155, - "acc_stderr": 0.010090368160990059, - "acc_norm": 0.5627104377104377, - "acc_norm_stderr": 0.01017876842932159 - }, - "arc_challenge": { - "acc": 0.26109215017064846, - "acc_stderr": 0.012835523909473847, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097222 - }, - "sciq": { - "acc": 0.88, - "acc_stderr": 0.010281328012747398, - "acc_norm": 0.853, - "acc_norm_stderr": 0.011203415395160336 - }, - "piqa": { - "acc": 0.7459194776931447, - "acc_stderr": 0.01015727199913504, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.010157271999135046 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_4.csv b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..809a0181e815f64038e6e27dd68a2de1773ce346 --- /dev/null +++ b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.337,0.014955087918653603,0 +anli_r2,acc,0.335,0.01493311749093257,0 +anli_r3,acc,0.3516666666666667,0.013789711695404794,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858112,0 +arc_challenge,acc_norm,0.2909556313993174,0.013273077865907581,0 +arc_easy,acc,0.5854377104377104,0.010108889212447774,0 +arc_easy,acc_norm,0.5635521885521886,0.010176569980111044,0 +boolq,acc,0.6082568807339449,0.008537618477478612,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.30671296296296297,,1 +copa,acc,0.74,0.04408440022768079,0 +hellaswag,acc,0.42959569806811393,0.004940067402031045,0 +hellaswag,acc_norm,0.5638319059948218,0.004948952519517511,0 +piqa,acc,0.7415669205658324,0.01021397163677332,0 +piqa,acc_norm,0.7437431991294886,0.010185787831565056,0 +rte,acc,0.5306859205776173,0.030039730592197816,0 +sciq,acc,0.872,0.01057013376110866,0 +sciq,acc_norm,0.867,0.010743669132397337,0 +storycloze_2016,acc,0.6862640299305185,0.010730179119317623,0 +winogrande,acc,0.5524861878453039,0.013974847640536197,0 diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index 4aef79f90038945dc41f5355a85c448445b97943..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.337, - "acc_stderr": 0.014955087918653603 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.01493311749093257 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404794 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.30671296296296297 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768079 - }, - "hellaswag": { - "acc": 0.42959569806811393, - "acc_stderr": 0.004940067402031045, - "acc_norm": 0.5638319059948218, - "acc_norm_stderr": 0.004948952519517511 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197816 - }, - "winogrande": { - "acc": 0.5524861878453039, - "acc_stderr": 0.013974847640536197 - }, - "storycloze_2016": { - "acc": 0.6862640299305185, - "acc_stderr": 0.010730179119317623 - }, - "boolq": { - "acc": 0.6082568807339449, - "acc_stderr": 0.008537618477478612 - }, - "arc_easy": { - "acc": 0.5854377104377104, - "acc_stderr": 0.010108889212447774, - "acc_norm": 0.5635521885521886, - "acc_norm_stderr": 0.010176569980111044 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858112, - "acc_norm": 0.2909556313993174, - "acc_norm_stderr": 0.013273077865907581 - }, - "sciq": { - "acc": 0.872, - "acc_stderr": 0.01057013376110866, - "acc_norm": 0.867, - "acc_norm_stderr": 0.010743669132397337 - }, - "piqa": { - "acc": 0.7415669205658324, - "acc_stderr": 0.01021397163677332, - "acc_norm": 0.7437431991294886, - "acc_norm_stderr": 0.010185787831565056 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_5.csv b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..8d8c57092bfd1f216ccac783c188e66eb26111d7 --- /dev/null +++ b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.354,0.015129868238451772,0 +anli_r2,acc,0.35,0.015090650341444231,0 +anli_r3,acc,0.3383333333333333,0.013664144006618268,0 +arc_challenge,acc,0.26023890784982934,0.012821930225112568,0 +arc_challenge,acc_norm,0.2713310580204778,0.012993807727545789,0 +arc_easy,acc,0.5900673400673401,0.010091953527506246,0 +arc_easy,acc_norm,0.5694444444444444,0.01016034539686008,0 +boolq,acc,0.6155963302752293,0.008508133844703914,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.400068271036013,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.43218482374029077,0.00494367338827627,0 +hellaswag,acc_norm,0.5656243776140211,0.004946617138983512,0 +piqa,acc,0.7393906420021763,0.010241826155811625,0 +piqa,acc_norm,0.7421109902067464,0.010206956662056241,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.889,0.009938701010583726,0 +sciq,acc_norm,0.876,0.01042749887234396,0 +storycloze_2016,acc,0.692143238909674,0.010674598158758188,0 +winogrande,acc,0.5548539857932123,0.013967662954355486,0 diff --git a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index fa1db31cf59e8ec05d48f2e7664c2e4b2d1629a4..0000000000000000000000000000000000000000 --- a/2b855b14bc4seed4/evaluation/rankeval/2b855b14bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.354, - "acc_stderr": 0.015129868238451772 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444231 - }, - "anli_r3": { - "acc": 0.3383333333333333, - "acc_stderr": 0.013664144006618268 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.400068271036013 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.43218482374029077, - "acc_stderr": 0.00494367338827627, - "acc_norm": 0.5656243776140211, - "acc_norm_stderr": 0.004946617138983512 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5548539857932123, - "acc_stderr": 0.013967662954355486 - }, - "storycloze_2016": { - "acc": 0.692143238909674, - "acc_stderr": 0.010674598158758188 - }, - "boolq": { - "acc": 0.6155963302752293, - "acc_stderr": 0.008508133844703914 - }, - "arc_easy": { - "acc": 0.5900673400673401, - "acc_stderr": 0.010091953527506246, - "acc_norm": 0.5694444444444444, - "acc_norm_stderr": 0.01016034539686008 - }, - "arc_challenge": { - "acc": 0.26023890784982934, - "acc_stderr": 0.012821930225112568, - "acc_norm": 0.2713310580204778, - "acc_norm_stderr": 0.012993807727545789 - }, - "sciq": { - "acc": 0.889, - "acc_stderr": 0.009938701010583726, - "acc_norm": 0.876, - "acc_norm_stderr": 0.01042749887234396 - }, - "piqa": { - "acc": 0.7393906420021763, - "acc_stderr": 0.010241826155811625, - "acc_norm": 0.7421109902067464, - "acc_norm_stderr": 0.010206956662056241 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_0.csv b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..f32e2eb696edd86fdd1cb64f931ba96599e546d8 --- /dev/null +++ b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795023,0 +anli_r2,acc,0.334,0.014922019523732963,0 +anli_r3,acc,0.34,0.013680495725767787,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.27303754266211605,0.01301933276263575,0 +arc_easy,acc,0.5627104377104377,0.010178768429321602,0 +arc_easy,acc_norm,0.5088383838383839,0.010258180468004831,0 +boolq,acc,0.6122324159021407,0.0085219003280139,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.30810810810810807,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4366660027882892,0.0049495895676788925,0 +hellaswag,acc_norm,0.5616411073491336,0.00495171762200798,0 +piqa,acc,0.7437431991294886,0.010185787831565062,0 +piqa,acc_norm,0.7459194776931447,0.010157271999135053,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.799,0.012679107214617324,0 +sciq,acc_norm,0.735,0.013963164754809953,0 +storycloze_2016,acc,0.6916087653661144,0.0106797344454878,0 +winogrande,acc,0.5611681136543015,0.013946933444507034,0 diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index cc335b1ae8e439f008a480e5a15f6f9520be0e32..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795023 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.014922019523732963 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767787 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.30810810810810807 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4366660027882892, - "acc_stderr": 0.0049495895676788925, - "acc_norm": 0.5616411073491336, - "acc_norm_stderr": 0.00495171762200798 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5611681136543015, - "acc_stderr": 0.013946933444507034 - }, - "storycloze_2016": { - "acc": 0.6916087653661144, - "acc_stderr": 0.0106797344454878 - }, - "boolq": { - "acc": 0.6122324159021407, - "acc_stderr": 0.0085219003280139 - }, - "arc_easy": { - "acc": 0.5627104377104377, - "acc_stderr": 0.010178768429321602, - "acc_norm": 0.5088383838383839, - "acc_norm_stderr": 0.010258180468004831 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.27303754266211605, - "acc_norm_stderr": 0.01301933276263575 - }, - "sciq": { - "acc": 0.799, - "acc_stderr": 0.012679107214617324, - "acc_norm": 0.735, - "acc_norm_stderr": 0.013963164754809953 - }, - "piqa": { - "acc": 0.7437431991294886, - "acc_stderr": 0.010185787831565062, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.010157271999135053 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_1.csv b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..dfe88ae58727c7e655825a2af2b76870be60dbf2 --- /dev/null +++ b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.343,0.015019206922356951,0 +anli_r2,acc,0.331,0.014888272588203938,0 +anli_r3,acc,0.35083333333333333,0.013782212417178197,0 +arc_challenge,acc,0.26109215017064846,0.012835523909473841,0 +arc_challenge,acc_norm,0.28668941979522183,0.013214986329274783,0 +arc_easy,acc,0.5812289562289562,0.010123487160167808,0 +arc_easy,acc_norm,0.5437710437710438,0.010220394383722024,0 +boolq,acc,0.5712538226299694,0.008655800332760226,1 +cb,acc,0.44642857142857145,0.06703189227942397,1 +cb,f1,0.3134878193701723,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.4342760406293567,0.0049464854665446254,0 +hellaswag,acc_norm,0.5609440350527783,0.0049525768633152155,0 +piqa,acc,0.735582154515778,0.010289787244767172,0 +piqa,acc_norm,0.7301414581066377,0.010356595421852195,0 +rte,acc,0.5523465703971119,0.029931070362939526,0 +sciq,acc,0.871,0.010605256784796563,0 +sciq,acc_norm,0.836,0.011715000693181325,0 +storycloze_2016,acc,0.6835916622127205,0.010754780097940887,0 +winogrande,acc,0.5666929755327546,0.013926915052757347,0 diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index c8d1949e998d44959561757158262b130837aac5..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, - "acc_stderr": 0.015019206922356951 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.014888272588203938 - }, - "anli_r3": { - "acc": 0.35083333333333333, - "acc_stderr": 0.013782212417178197 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942397, - "f1": 0.3134878193701723 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.4342760406293567, - "acc_stderr": 0.0049464854665446254, - "acc_norm": 0.5609440350527783, - "acc_norm_stderr": 0.0049525768633152155 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5666929755327546, - "acc_stderr": 0.013926915052757347 - }, - "storycloze_2016": { - "acc": 0.6835916622127205, - "acc_stderr": 0.010754780097940887 - }, - "boolq": { - "acc": 0.5712538226299694, - "acc_stderr": 0.008655800332760226 - }, - "arc_easy": { - "acc": 0.5812289562289562, - "acc_stderr": 0.010123487160167808, - "acc_norm": 0.5437710437710438, - "acc_norm_stderr": 0.010220394383722024 - }, - "arc_challenge": { - "acc": 0.26109215017064846, - "acc_stderr": 0.012835523909473841, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.013214986329274783 - }, - "sciq": { - "acc": 0.871, - "acc_stderr": 0.010605256784796563, - "acc_norm": 0.836, - "acc_norm_stderr": 0.011715000693181325 - }, - "piqa": { - "acc": 0.735582154515778, - "acc_stderr": 0.010289787244767172, - "acc_norm": 0.7301414581066377, - "acc_norm_stderr": 0.010356595421852195 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_2.csv b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..fe72889ba36b1e34b8149d42ba2e07291f3b243c --- /dev/null +++ b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.339,0.014976758771620347,0 +anli_r2,acc,0.338,0.01496596071022448,0 +anli_r3,acc,0.3358333333333333,0.013639261190932877,0 +arc_challenge,acc,0.2696245733788396,0.012968040686869143,0 +arc_challenge,acc_norm,0.28668941979522183,0.013214986329274776,0 +arc_easy,acc,0.5938552188552189,0.010077409815364057,0 +arc_easy,acc_norm,0.5749158249158249,0.010143966195717845,0 +boolq,acc,0.5651376146788991,0.00867052847184156,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.24689440993788817,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.4325831507667795,0.004944215937021391,0 +hellaswag,acc_norm,0.5642302330213105,0.004948439229523912,0 +piqa,acc,0.7415669205658324,0.010213971636773313,0 +piqa,acc_norm,0.7328618063112078,0.010323440492612418,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.886,0.010055103435823332,0 +sciq,acc_norm,0.864,0.010845350230472988,0 +storycloze_2016,acc,0.6846606092998396,0.010744989116260668,0 +winogrande,acc,0.5611681136543015,0.013946933444507032,0 diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index c505b56cb4a2a1d813fb69854e52aef4212f81ec..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.339, - "acc_stderr": 0.014976758771620347 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.01496596071022448 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.013639261190932877 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.24689440993788817 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.4325831507667795, - "acc_stderr": 0.004944215937021391, - "acc_norm": 0.5642302330213105, - "acc_norm_stderr": 0.004948439229523912 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5611681136543015, - "acc_stderr": 0.013946933444507032 - }, - "storycloze_2016": { - "acc": 0.6846606092998396, - "acc_stderr": 0.010744989116260668 - }, - "boolq": { - "acc": 0.5651376146788991, - "acc_stderr": 0.00867052847184156 - }, - "arc_easy": { - "acc": 0.5938552188552189, - "acc_stderr": 0.010077409815364057, - "acc_norm": 0.5749158249158249, - "acc_norm_stderr": 0.010143966195717845 - }, - "arc_challenge": { - "acc": 0.2696245733788396, - "acc_stderr": 0.012968040686869143, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.013214986329274776 - }, - "sciq": { - "acc": 0.886, - "acc_stderr": 0.010055103435823332, - "acc_norm": 0.864, - "acc_norm_stderr": 0.010845350230472988 - }, - "piqa": { - "acc": 0.7415669205658324, - "acc_stderr": 0.010213971636773313, - "acc_norm": 0.7328618063112078, - "acc_norm_stderr": 0.010323440492612418 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_3.csv b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..375a6f13cad2e787b2bee21bb98b8b987365b6c4 --- /dev/null +++ b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.014876872027456732,0 +anli_r2,acc,0.355,0.01513949154378053,0 +anli_r3,acc,0.3283333333333333,0.01356203291952902,0 +arc_challenge,acc,0.27303754266211605,0.013019332762635743,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975576,0 +arc_easy,acc,0.5883838383838383,0.01009821864671491,0 +arc_easy,acc_norm,0.5787037037037037,0.010131882498193127,0 +boolq,acc,0.5477064220183486,0.008705158179072315,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.33040639423618146,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.43178649671380204,0.004943127583290518,0 +hellaswag,acc_norm,0.5643298147779326,0.0049483103997460835,0 +piqa,acc,0.7388465723612623,0.01024873864993558,0 +piqa,acc_norm,0.7393906420021763,0.01024182615581163,0 +rte,acc,0.5703971119133574,0.02979666882912467,0 +sciq,acc,0.881,0.010244215145336664,0 +sciq,acc_norm,0.876,0.010427498872343961,0 +storycloze_2016,acc,0.6910742918225548,0.010684853966268454,0 +winogrande,acc,0.5414364640883977,0.014004146853791902,0 diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index 30ecda1d08d746254f6ece6dc250ecb9d5c776c8..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.014876872027456732 - }, - "anli_r2": { - "acc": 0.355, - "acc_stderr": 0.01513949154378053 - }, - "anli_r3": { - "acc": 0.3283333333333333, - "acc_stderr": 0.01356203291952902 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.33040639423618146 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.43178649671380204, - "acc_stderr": 0.004943127583290518, - "acc_norm": 0.5643298147779326, - "acc_norm_stderr": 0.0049483103997460835 - }, - "rte": { - "acc": 0.5703971119133574, - "acc_stderr": 0.02979666882912467 - }, - "winogrande": { - "acc": 0.5414364640883977, - "acc_stderr": 0.014004146853791902 - }, - "storycloze_2016": { - "acc": 0.6910742918225548, - "acc_stderr": 0.010684853966268454 - }, - "boolq": { - "acc": 0.5477064220183486, - "acc_stderr": 0.008705158179072315 - }, - "arc_easy": { - "acc": 0.5883838383838383, - "acc_stderr": 0.01009821864671491, - "acc_norm": 0.5787037037037037, - "acc_norm_stderr": 0.010131882498193127 - }, - "arc_challenge": { - "acc": 0.27303754266211605, - "acc_stderr": 0.013019332762635743, - "acc_norm": 0.28071672354948807, - "acc_norm_stderr": 0.013131238126975576 - }, - "sciq": { - "acc": 0.881, - "acc_stderr": 0.010244215145336664, - "acc_norm": 0.876, - "acc_norm_stderr": 0.010427498872343961 - }, - "piqa": { - "acc": 0.7388465723612623, - "acc_stderr": 0.01024873864993558, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.01024182615581163 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_4.csv b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..837946e7b8f8a5297097fa38291670b84ec82e16 --- /dev/null +++ b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.338,0.014965960710224489,0 +anli_r3,acc,0.33916666666666667,0.013672343491681815,0 +arc_challenge,acc,0.26109215017064846,0.012835523909473841,0 +arc_challenge,acc_norm,0.2909556313993174,0.013273077865907595,0 +arc_easy,acc,0.5980639730639731,0.010060521220920566,0 +arc_easy,acc_norm,0.5778619528619529,0.01013462052459227,0 +boolq,acc,0.5314984709480123,0.00872768484861531,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2631016042780749,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.4298944433379805,0.004940490508240648,0 +hellaswag,acc_norm,0.5660227046405099,0.004946089230153028,0 +piqa,acc,0.7388465723612623,0.010248738649935573,0 +piqa,acc_norm,0.7442872687704026,0.010178690109459872,0 +rte,acc,0.51985559566787,0.030072723167317184,0 +sciq,acc,0.896,0.009658016218524293,0 +sciq,acc_norm,0.873,0.010534798620855745,0 +storycloze_2016,acc,0.6953500801710315,0.0106434269886468,0 +winogrande,acc,0.5461720599842147,0.013992441563707068,0 diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index 5f8e1ff8a9bed9419797483045c4a83321202eb7..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224489 - }, - "anli_r3": { - "acc": 0.33916666666666667, - "acc_stderr": 0.013672343491681815 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2631016042780749 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.4298944433379805, - "acc_stderr": 0.004940490508240648, - "acc_norm": 0.5660227046405099, - "acc_norm_stderr": 0.004946089230153028 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317184 - }, - "winogrande": { - "acc": 0.5461720599842147, - "acc_stderr": 0.013992441563707068 - }, - "storycloze_2016": { - "acc": 0.6953500801710315, - "acc_stderr": 0.0106434269886468 - }, - "boolq": { - "acc": 0.5314984709480123, - "acc_stderr": 0.00872768484861531 - }, - "arc_easy": { - "acc": 0.5980639730639731, - "acc_stderr": 0.010060521220920566, - "acc_norm": 0.5778619528619529, - "acc_norm_stderr": 0.01013462052459227 - }, - "arc_challenge": { - "acc": 0.26109215017064846, - "acc_stderr": 0.012835523909473841, - "acc_norm": 0.2909556313993174, - "acc_norm_stderr": 0.013273077865907595 - }, - "sciq": { - "acc": 0.896, - "acc_stderr": 0.009658016218524293, - "acc_norm": 0.873, - "acc_norm_stderr": 0.010534798620855745 - }, - "piqa": { - "acc": 0.7388465723612623, - "acc_stderr": 0.010248738649935573, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.010178690109459872 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_5.csv b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..b8ff5df16cf906ea1ce3e82850ba20004907a062 --- /dev/null +++ b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.317,0.014721675438880229,0 +anli_r2,acc,0.362,0.0152048409129195,0 +anli_r3,acc,0.34,0.013680495725767797,0 +arc_challenge,acc,0.26706484641638223,0.012928933196496356,0 +arc_challenge,acc_norm,0.2841296928327645,0.013179442447653887,0 +arc_easy,acc,0.5934343434343434,0.010079056419223523,0 +arc_easy,acc_norm,0.5803872053872053,0.010126315840891536,0 +boolq,acc,0.5262996941896024,0.008732949144494798,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.31333333333333335,,1 +copa,acc,0.72,0.045126085985421276,0 +hellaswag,acc,0.43218482374029077,0.004943673388276271,0 +hellaswag,acc_norm,0.5679147580163314,0.004943537242344413,0 +piqa,acc,0.7295973884657236,0.010363167031620789,0 +piqa,acc_norm,0.7388465723612623,0.010248738649935592,0 +rte,acc,0.5848375451263538,0.02966006629089348,0 +sciq,acc,0.899,0.009533618929340997,0 +sciq,acc_norm,0.888,0.00997775303139725,0 +storycloze_2016,acc,0.692143238909674,0.010674598158758175,0 +winogrande,acc,0.55327545382794,0.0139724883716167,0 diff --git a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index 7c81fe5db40ba1177fb4659c5f3b5f8ad8529d81..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed1/evaluation/rankeval/2b855b18bc4seed1_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.317, - "acc_stderr": 0.014721675438880229 - }, - "anli_r2": { - "acc": 0.362, - "acc_stderr": 0.0152048409129195 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767797 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.31333333333333335 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.045126085985421276 - }, - "hellaswag": { - "acc": 0.43218482374029077, - "acc_stderr": 0.004943673388276271, - "acc_norm": 0.5679147580163314, - "acc_norm_stderr": 0.004943537242344413 - }, - "rte": { - "acc": 0.5848375451263538, - "acc_stderr": 0.02966006629089348 - }, - "winogrande": { - "acc": 0.55327545382794, - "acc_stderr": 0.0139724883716167 - }, - "storycloze_2016": { - "acc": 0.692143238909674, - "acc_stderr": 0.010674598158758175 - }, - "boolq": { - "acc": 0.5262996941896024, - "acc_stderr": 0.008732949144494798 - }, - "arc_easy": { - "acc": 0.5934343434343434, - "acc_stderr": 0.010079056419223523, - "acc_norm": 0.5803872053872053, - "acc_norm_stderr": 0.010126315840891536 - }, - "arc_challenge": { - "acc": 0.26706484641638223, - "acc_stderr": 0.012928933196496356, - "acc_norm": 0.2841296928327645, - "acc_norm_stderr": 0.013179442447653887 - }, - "sciq": { - "acc": 0.899, - "acc_stderr": 0.009533618929340997, - "acc_norm": 0.888, - "acc_norm_stderr": 0.00997775303139725 - }, - "piqa": { - "acc": 0.7295973884657236, - "acc_stderr": 0.010363167031620789, - "acc_norm": 0.7388465723612623, - "acc_norm_stderr": 0.010248738649935592 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_0.csv b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..5e67147929498e8c6ce9007b6d1b2200bf31def8 --- /dev/null +++ b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121731,0 +anli_r2,acc,0.333,0.014910846164229864,0 +anli_r3,acc,0.33166666666666667,0.013596836729485159,0 +arc_challenge,acc,0.2508532423208191,0.012668198621315433,0 +arc_challenge,acc_norm,0.2790102389078498,0.013106784883601341,0 +arc_easy,acc,0.5622895622895623,0.01017985648600691,0 +arc_easy,acc_norm,0.49747474747474746,0.010259652668783474,0 +boolq,acc,0.5957186544342508,0.008583313811372064,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.27398545625146614,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.43507269468233417,0.004947533158712096,0 +hellaswag,acc_norm,0.5631348336984664,0.004949842967331434,0 +piqa,acc,0.7372143634385201,0.010269354068140767,0 +piqa,acc_norm,0.7442872687704026,0.010178690109459876,0 +rte,acc,0.5342960288808665,0.030025579819366422,0 +sciq,acc,0.814,0.012310790208412805,0 +sciq,acc_norm,0.725,0.014127086556490528,0 +storycloze_2016,acc,0.694815606627472,0.010648664383985656,0 +winogrande,acc,0.5564325177584846,0.013962694907620397,0 diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index 83069dd3fa2c8f8f31fd817d5e9a9e7df13576c3..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.015008706182121731 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229864 - }, - "anli_r3": { - "acc": 0.33166666666666667, - "acc_stderr": 0.013596836729485159 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.27398545625146614 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.43507269468233417, - "acc_stderr": 0.004947533158712096, - "acc_norm": 0.5631348336984664, - "acc_norm_stderr": 0.004949842967331434 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366422 - }, - "winogrande": { - "acc": 0.5564325177584846, - "acc_stderr": 0.013962694907620397 - }, - "storycloze_2016": { - "acc": 0.694815606627472, - "acc_stderr": 0.010648664383985656 - }, - "boolq": { - "acc": 0.5957186544342508, - "acc_stderr": 0.008583313811372064 - }, - "arc_easy": { - "acc": 0.5622895622895623, - "acc_stderr": 0.01017985648600691, - "acc_norm": 0.49747474747474746, - "acc_norm_stderr": 0.010259652668783474 - }, - "arc_challenge": { - "acc": 0.2508532423208191, - "acc_stderr": 0.012668198621315433, - "acc_norm": 0.2790102389078498, - "acc_norm_stderr": 0.013106784883601341 - }, - "sciq": { - "acc": 0.814, - "acc_stderr": 0.012310790208412805, - "acc_norm": 0.725, - "acc_norm_stderr": 0.014127086556490528 - }, - "piqa": { - "acc": 0.7372143634385201, - "acc_stderr": 0.010269354068140767, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.010178690109459876 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_1.csv b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..6665d34c47469b34e22d48a52c5d9a16f1779b87 --- /dev/null +++ b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.01487687202745673,0 +anli_r2,acc,0.319,0.014746404865473475,0 +anli_r3,acc,0.3375,0.013655897185463657,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207706,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0 +arc_easy,acc,0.5765993265993266,0.010138671005289045,0 +arc_easy,acc_norm,0.5332491582491582,0.010237073872130745,0 +boolq,acc,0.5825688073394495,0.008624990050216677,1 +cb,acc,0.5535714285714286,0.06703189227942397,1 +cb,f1,0.3821828472991264,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.43218482374029077,0.004943673388276274,0 +hellaswag,acc_norm,0.5641306512646883,0.004948567856373859,0 +piqa,acc,0.7377584330794341,0.010262502565172449,0 +piqa,acc_norm,0.7415669205658324,0.010213971636773308,0 +rte,acc,0.5270758122743683,0.0300523034631437,0 +sciq,acc,0.844,0.011480235006122358,0 +sciq,acc_norm,0.822,0.01210216767618359,0 +storycloze_2016,acc,0.6905398182789952,0.010689956745189074,0 +winogrande,acc,0.5509076558800315,0.013979459389140844,0 diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json deleted file mode 100644 index 02593d0df59960de4fe6b3bfcb7ffd3ce313fd90..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.01487687202745673 - }, - "anli_r2": { - "acc": 0.319, - "acc_stderr": 0.014746404865473475 - }, - "anli_r3": { - "acc": 0.3375, - "acc_stderr": 0.013655897185463657 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942397, - "f1": 0.3821828472991264 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.43218482374029077, - "acc_stderr": 0.004943673388276274, - "acc_norm": 0.5641306512646883, - "acc_norm_stderr": 0.004948567856373859 - }, - "rte": { - "acc": 0.5270758122743683, - "acc_stderr": 0.0300523034631437 - }, - "winogrande": { - "acc": 0.5509076558800315, - "acc_stderr": 0.013979459389140844 - }, - "storycloze_2016": { - "acc": 0.6905398182789952, - "acc_stderr": 0.010689956745189074 - }, - "boolq": { - "acc": 0.5825688073394495, - "acc_stderr": 0.008624990050216677 - }, - "arc_easy": { - "acc": 0.5765993265993266, - "acc_stderr": 0.010138671005289045, - "acc_norm": 0.5332491582491582, - "acc_norm_stderr": 0.010237073872130745 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207706, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097222 - }, - "sciq": { - "acc": 0.844, - "acc_stderr": 0.011480235006122358, - "acc_norm": 0.822, - "acc_norm_stderr": 0.01210216767618359 - }, - "piqa": { - "acc": 0.7377584330794341, - "acc_stderr": 0.010262502565172449, - "acc_norm": 0.7415669205658324, - "acc_norm_stderr": 0.010213971636773308 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_2.csv b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..d9ce098c4a647c5ff34fc026cf991e2c1a2f0b34 --- /dev/null +++ b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.305,0.014566646394664397,0 +anli_r2,acc,0.33,0.01487687202745673,0 +anli_r3,acc,0.30916666666666665,0.013346684134591951,0 +arc_challenge,acc,0.25597269624573377,0.012753013241244528,0 +arc_challenge,acc_norm,0.295221843003413,0.01332975029338232,0 +arc_easy,acc,0.5871212121212122,0.010102837421104661,0 +arc_easy,acc_norm,0.5551346801346801,0.010197216690356423,0 +boolq,acc,0.5929663608562691,0.008592562887068868,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.291005291005291,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.4313881696873133,0.00494257852098735,0 +hellaswag,acc_norm,0.5652260505875324,0.004947141797384123,0 +piqa,acc,0.736126224156692,0.010282996367695566,0 +piqa,acc_norm,0.7377584330794341,0.010262502565172438,0 +rte,acc,0.5667870036101083,0.02982676408213827,0 +sciq,acc,0.866,0.01077776229836968,0 +sciq,acc_norm,0.865,0.010811655372416051,0 +storycloze_2016,acc,0.6926777124532336,0.01066944508186666,0 +winogrande,acc,0.5524861878453039,0.013974847640536194,0 diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json deleted file mode 100644 index 050d454c87ac53eceec8578f21101648c45586e8..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.305, - "acc_stderr": 0.014566646394664397 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.01487687202745673 - }, - "anli_r3": { - "acc": 0.30916666666666665, - "acc_stderr": 0.013346684134591951 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.291005291005291 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.4313881696873133, - "acc_stderr": 0.00494257852098735, - "acc_norm": 0.5652260505875324, - "acc_norm_stderr": 0.004947141797384123 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.02982676408213827 - }, - "winogrande": { - "acc": 0.5524861878453039, - "acc_stderr": 0.013974847640536194 - }, - "storycloze_2016": { - "acc": 0.6926777124532336, - "acc_stderr": 0.01066944508186666 - }, - "boolq": { - "acc": 0.5929663608562691, - "acc_stderr": 0.008592562887068868 - }, - "arc_easy": { - "acc": 0.5871212121212122, - "acc_stderr": 0.010102837421104661, - "acc_norm": 0.5551346801346801, - "acc_norm_stderr": 0.010197216690356423 - }, - "arc_challenge": { - "acc": 0.25597269624573377, - "acc_stderr": 0.012753013241244528, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.01332975029338232 - }, - "sciq": { - "acc": 0.866, - "acc_stderr": 0.01077776229836968, - "acc_norm": 0.865, - "acc_norm_stderr": 0.010811655372416051 - }, - "piqa": { - "acc": 0.736126224156692, - "acc_stderr": 0.010282996367695566, - "acc_norm": 0.7377584330794341, - "acc_norm_stderr": 0.010262502565172438 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_3.csv b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..8021f2eb87d8375a42478a664e2b0eb78f310796 --- /dev/null +++ b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.309,0.014619600977206488,0 +anli_r2,acc,0.357,0.015158521721486767,0 +anli_r3,acc,0.3283333333333333,0.013562032919529019,0 +arc_challenge,acc,0.26621160409556316,0.012915774781523203,0 +arc_challenge,acc_norm,0.2858361774744027,0.01320319608853737,0 +arc_easy,acc,0.5812289562289562,0.010123487160167812,0 +arc_easy,acc_norm,0.5568181818181818,0.010193324837773495,0 +boolq,acc,0.5899082568807339,0.008602512053254421,1 +cb,acc,0.5178571428571429,0.06737697508644648,1 +cb,f1,0.4183760683760684,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.43168691495717987,0.004942990623131124,0 +hellaswag,acc_norm,0.5659231228838877,0.004946221512145287,0 +piqa,acc,0.7372143634385201,0.010269354068140767,0 +piqa,acc_norm,0.749183895538629,0.010113869547069047,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.876,0.01042749887234396,0 +sciq,acc_norm,0.857,0.011075814808567038,0 +storycloze_2016,acc,0.6889363976483164,0.010705164869803167,0 +winogrande,acc,0.5509076558800315,0.013979459389140842,0 diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json deleted file mode 100644 index 226dbff7031f9426e830a5758af50238f67fe072..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.309, - "acc_stderr": 0.014619600977206488 - }, - "anli_r2": { - "acc": 0.357, - "acc_stderr": 0.015158521721486767 - }, - "anli_r3": { - "acc": 0.3283333333333333, - "acc_stderr": 0.013562032919529019 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644648, - "f1": 0.4183760683760684 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.43168691495717987, - "acc_stderr": 0.004942990623131124, - "acc_norm": 0.5659231228838877, - "acc_norm_stderr": 0.004946221512145287 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5509076558800315, - "acc_stderr": 0.013979459389140842 - }, - "storycloze_2016": { - "acc": 0.6889363976483164, - "acc_stderr": 0.010705164869803167 - }, - "boolq": { - "acc": 0.5899082568807339, - "acc_stderr": 0.008602512053254421 - }, - "arc_easy": { - "acc": 0.5812289562289562, - "acc_stderr": 0.010123487160167812, - "acc_norm": 0.5568181818181818, - "acc_norm_stderr": 0.010193324837773495 - }, - "arc_challenge": { - "acc": 0.26621160409556316, - "acc_stderr": 0.012915774781523203, - "acc_norm": 0.2858361774744027, - "acc_norm_stderr": 0.01320319608853737 - }, - "sciq": { - "acc": 0.876, - "acc_stderr": 0.01042749887234396, - "acc_norm": 0.857, - "acc_norm_stderr": 0.011075814808567038 - }, - "piqa": { - "acc": 0.7372143634385201, - "acc_stderr": 0.010269354068140767, - "acc_norm": 0.749183895538629, - "acc_norm_stderr": 0.010113869547069047 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_4.csv b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..8b7786fca823a5a71ee12ef3d911b9b15720f10b --- /dev/null +++ b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363935,0 +anli_r2,acc,0.354,0.015129868238451773,0 +anli_r3,acc,0.345,0.013728421539454876,0 +arc_challenge,acc,0.25597269624573377,0.012753013241244525,0 +arc_challenge,acc_norm,0.29180887372013653,0.013284525292403504,0 +arc_easy,acc,0.5854377104377104,0.010108889212447774,0 +arc_easy,acc_norm,0.5572390572390572,0.010192333348394459,0 +boolq,acc,0.5896024464831804,0.008603488048617517,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3213507625272331,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.42929695279824737,0.004939642460172587,0 +hellaswag,acc_norm,0.5668193586934873,0.004945023657032274,0 +piqa,acc,0.7399347116430903,0.010234893249061301,0 +piqa,acc_norm,0.7388465723612623,0.010248738649935588,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.883,0.010169287802713329,0 +sciq,acc_norm,0.871,0.010605256784796582,0 +storycloze_2016,acc,0.6910742918225548,0.01068485396626845,0 +winogrande,acc,0.5493291239147593,0.01398392886904024,0 diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json deleted file mode 100644 index 8e7db339df6b8a974525abc43bbf09325a7f9057..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363935 - }, - "anli_r2": { - "acc": 0.354, - "acc_stderr": 0.015129868238451773 - }, - "anli_r3": { - "acc": 0.345, - "acc_stderr": 0.013728421539454876 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3213507625272331 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.42929695279824737, - "acc_stderr": 0.004939642460172587, - "acc_norm": 0.5668193586934873, - "acc_norm_stderr": 0.004945023657032274 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5493291239147593, - "acc_stderr": 0.01398392886904024 - }, - "storycloze_2016": { - "acc": 0.6910742918225548, - "acc_stderr": 0.01068485396626845 - }, - "boolq": { - "acc": 0.5896024464831804, - "acc_stderr": 0.008603488048617517 - }, - "arc_easy": { - "acc": 0.5854377104377104, - "acc_stderr": 0.010108889212447774, - "acc_norm": 0.5572390572390572, - "acc_norm_stderr": 0.010192333348394459 - }, - "arc_challenge": { - "acc": 0.25597269624573377, - "acc_stderr": 0.012753013241244525, - "acc_norm": 0.29180887372013653, - "acc_norm_stderr": 0.013284525292403504 - }, - "sciq": { - "acc": 0.883, - "acc_stderr": 0.010169287802713329, - "acc_norm": 0.871, - "acc_norm_stderr": 0.010605256784796582 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.010234893249061301, - "acc_norm": 0.7388465723612623, - "acc_norm_stderr": 0.010248738649935588 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_5.csv b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..9bf17acf2d9701186528e82a883147264edb9b2c --- /dev/null +++ b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.351,0.015100563798316402,0 +anli_r2,acc,0.338,0.014965960710224489,0 +anli_r3,acc,0.33166666666666667,0.013596836729485176,0 +arc_challenge,acc,0.26535836177474403,0.012902554762313966,0 +arc_challenge,acc_norm,0.29266211604095566,0.013295916103619415,0 +arc_easy,acc,0.5904882154882155,0.01009036816099006,0 +arc_easy,acc_norm,0.5627104377104377,0.010178768429321592,0 +boolq,acc,0.5902140672782875,0.008601532621213522,1 +cb,acc,0.5714285714285714,0.06672848092813058,1 +cb,f1,0.3624429223744292,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.42949611631149176,0.004939925958728871,0 +hellaswag,acc_norm,0.566620195180243,0.0049452912700724315,0 +piqa,acc,0.735038084874864,0.01029655799331605,0 +piqa,acc_norm,0.735582154515778,0.010289787244767161,0 +rte,acc,0.5523465703971119,0.029931070362939526,0 +sciq,acc,0.888,0.00997775303139725,0 +sciq,acc_norm,0.881,0.010244215145336664,0 +storycloze_2016,acc,0.6916087653661144,0.010679734445487794,0 +winogrande,acc,0.5556432517758485,0.013965196769083555,0 diff --git a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json b/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json deleted file mode 100644 index 56b69b9942cd99a5a2447e34a1a8c665f671d697..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed2/evaluation/rankeval/2b855b18bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316402 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224489 - }, - "anli_r3": { - "acc": 0.33166666666666667, - "acc_stderr": 0.013596836729485176 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.3624429223744292 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.42949611631149176, - "acc_stderr": 0.004939925958728871, - "acc_norm": 0.566620195180243, - "acc_norm_stderr": 0.0049452912700724315 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5556432517758485, - "acc_stderr": 0.013965196769083555 - }, - "storycloze_2016": { - "acc": 0.6916087653661144, - "acc_stderr": 0.010679734445487794 - }, - "boolq": { - "acc": 0.5902140672782875, - "acc_stderr": 0.008601532621213522 - }, - "arc_easy": { - "acc": 0.5904882154882155, - "acc_stderr": 0.01009036816099006, - "acc_norm": 0.5627104377104377, - "acc_norm_stderr": 0.010178768429321592 - }, - "arc_challenge": { - "acc": 0.26535836177474403, - "acc_stderr": 0.012902554762313966, - "acc_norm": 0.29266211604095566, - "acc_norm_stderr": 0.013295916103619415 - }, - "sciq": { - "acc": 0.888, - "acc_stderr": 0.00997775303139725, - "acc_norm": 0.881, - "acc_norm_stderr": 0.010244215145336664 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.01029655799331605, - "acc_norm": 0.735582154515778, - "acc_norm_stderr": 0.010289787244767161 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/generation/merged.csv b/2b855b18bc4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..bdc41f0dca30a12ad504974014ce3dd410eb9544 --- /dev/null +++ b/2b855b18bc4seed3/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00033458548372763047 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00033458548372763047 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.15117396956389217 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.15117396956389217 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1689945125341207 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1689945125341207 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.18929612249057426 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18929612249057426 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1962921120031527 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1962921120031527 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19877083959875572 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19877083959875572 +e2e_nlg_cleaned,5,average,multiple,0.15081035694570386 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04301195793696945 +gem_xsum,0,median,rouge2_fmeasure,0.04301195793696945 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.02999018501144029 +gem_xsum,1,median,rouge2_fmeasure,0.02999018501144029 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.031359702416680915 +gem_xsum,2,median,rouge2_fmeasure,0.031359702416680915 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.02981894291789061 +gem_xsum,3,median,rouge2_fmeasure,0.02981894291789061 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.00898850210998665 +gem_xsum,4,median,rouge2_fmeasure,0.00898850210998665 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.000236662827910983 +gem_xsum,5,median,rouge2_fmeasure,0.000236662827910983 +gem_xsum,5,average,multiple,0.023900992203479816 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05301792033252035 +web_nlg_en,0,median,rouge2_fmeasure,0.05301792033252035 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0503949121856254 +web_nlg_en,1,median,rouge2_fmeasure,0.0503949121856254 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05281996574069844 +web_nlg_en,2,median,rouge2_fmeasure,0.05281996574069844 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05291603993699626 +web_nlg_en,3,median,rouge2_fmeasure,0.05291603993699626 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05450572513549272 +web_nlg_en,4,median,rouge2_fmeasure,0.05450572513549272 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.0554870860380319 +web_nlg_en,5,median,rouge2_fmeasure,0.0554870860380319 +web_nlg_en,5,average,multiple,0.053190274894894175 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03689634101691613 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03689634101691613 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.042953733672235817 +wiki_lingua_en,1,median,rouge2_fmeasure,0.042953733672235817 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.041658085133235837 +wiki_lingua_en,2,median,rouge2_fmeasure,0.041658085133235837 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03537230832895332 +wiki_lingua_en,3,median,rouge2_fmeasure,0.03537230832895332 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.011291919512703433 +wiki_lingua_en,4,median,rouge2_fmeasure,0.011291919512703433 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0018401062454758718 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0018401062454758718 +wiki_lingua_en,5,average,multiple,0.028335415651586735 diff --git a/2b855b18bc4seed3/evaluation/generation/merged.json b/2b855b18bc4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..92128549595ba0021d1c619c94af4f47609399ef --- /dev/null +++ b/2b855b18bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3731476324585187, "bleu_stderr": 0.04279095345810554, "rouge1_fmeasure": 0.11158760019639265, "rouge1_fmeasure_stderr": 0.00219301047235451, "rouge1_precision": 0.07329265014300652, "rouge1_precision_stderr": 0.0016523927328479732, "rouge1_recall": 0.30809933469733186, "rouge1_recall_stderr": 0.004750111252627495, "rouge2_fmeasure": 0.05301792033252035, "rouge2_fmeasure_stderr": 0.001363804412492064, "rouge2_precision": 0.03471754161461186, "rouge2_precision_stderr": 0.0010024286124792183, "rouge2_recall": 0.15050857846465385, "rouge2_recall_stderr": 0.0032894250495978127, "rougeL_fmeasure": 0.10700498670828301, "rougeL_fmeasure_stderr": 0.0019964714146835523, "rougeL_precision": 0.06987268977786676, "rougeL_precision_stderr": 0.0014699869446300976, "rougeL_recall": 0.2992292062662937, "rougeL_recall_stderr": 0.004605281917079719, "rougeLsum_fmeasure": 0.10657396102935507, "rougeLsum_fmeasure_stderr": 0.0020497326851297943, "rougeLsum_precision": 0.06991258389741457, "rougeLsum_precision_stderr": 0.0015389933617200125, "rougeLsum_recall": 0.2950310011434255, "rougeLsum_recall_stderr": 0.004469935702654606}}, "1": {"PALM_prompt": {"bleu": 0.42555609648238263, "bleu_stderr": 0.03754313545614141, "rouge1_fmeasure": 0.11068664903027482, "rouge1_fmeasure_stderr": 0.001962491500415961, "rouge1_precision": 0.07184378890189147, "rouge1_precision_stderr": 0.0016335300613952862, "rouge1_recall": 0.35126095559507303, "rouge1_recall_stderr": 0.005146366380570411, "rouge2_fmeasure": 0.0503949121856254, "rouge2_fmeasure_stderr": 0.001195935930031272, "rouge2_precision": 0.03318926992483388, "rouge2_precision_stderr": 0.0011444719487414805, "rouge2_recall": 0.16588217417695206, "rouge2_recall_stderr": 0.003496202970566127, "rougeL_fmeasure": 0.10407818788613064, "rougeL_fmeasure_stderr": 0.0017679127942389118, "rougeL_precision": 0.06741529955427898, "rougeL_precision_stderr": 0.0014815628323586328, "rougeL_recall": 0.3301177743921885, "rougeL_recall_stderr": 0.004743764788988533, "rougeLsum_fmeasure": 0.1051153938081113, "rougeLsum_fmeasure_stderr": 0.0018415297720731129, "rougeLsum_precision": 0.06826621551722116, "rougeLsum_precision_stderr": 0.0015448182246924134, "rougeLsum_recall": 0.33222271225825306, "rougeLsum_recall_stderr": 0.004775750907297394}}, "2": {"PALM_prompt": {"bleu": 0.4652568801554802, "bleu_stderr": 0.02086448405693204, "rouge1_fmeasure": 0.11711231568686131, "rouge1_fmeasure_stderr": 0.0020215180535217707, "rouge1_precision": 0.07599043467449453, "rouge1_precision_stderr": 0.0016845923026391355, "rouge1_recall": 0.377618766881284, "rouge1_recall_stderr": 0.005209532164166404, "rouge2_fmeasure": 0.05281996574069844, "rouge2_fmeasure_stderr": 0.0012924121652213556, "rouge2_precision": 0.034348379918539965, "rouge2_precision_stderr": 0.001032693208366895, "rouge2_recall": 0.17852124341138678, "rouge2_recall_stderr": 0.0036138800776794387, "rougeL_fmeasure": 0.10835705961075051, "rougeL_fmeasure_stderr": 0.0018096831755968982, "rougeL_precision": 0.07005773894408303, "rougeL_precision_stderr": 0.0014598106997970595, "rougeL_recall": 0.3481842621327449, "rougeL_recall_stderr": 0.004649995361878229, "rougeLsum_fmeasure": 0.1107610087186827, "rougeLsum_fmeasure_stderr": 0.0018799867889272243, "rougeLsum_precision": 0.07171742871280155, "rougeLsum_precision_stderr": 0.0015260807660963526, "rougeLsum_recall": 0.35646781939128613, "rougeLsum_recall_stderr": 0.004810915232012511}}, "3": {"PALM_prompt": {"bleu": 0.5365207319877168, "bleu_stderr": 0.034842554087089736, "rouge1_fmeasure": 0.11695362837750037, "rouge1_fmeasure_stderr": 0.0018463734974498126, "rouge1_precision": 0.07558089543478737, "rouge1_precision_stderr": 0.0015729141123758212, "rouge1_recall": 0.38517914869842684, "rouge1_recall_stderr": 0.005179305028690691, "rouge2_fmeasure": 0.05291603993699626, "rouge2_fmeasure_stderr": 0.0011336399045749435, "rouge2_precision": 0.03381949808202605, "rouge2_precision_stderr": 0.0008604160386549045, "rouge2_recall": 0.18680399389471034, "rouge2_recall_stderr": 0.0037437050725559456, "rougeL_fmeasure": 0.10758336318058453, "rougeL_fmeasure_stderr": 0.001640730488763228, "rougeL_precision": 0.06941793978395291, "rougeL_precision_stderr": 0.001383924108922276, "rougeL_recall": 0.3537075955085191, "rougeL_recall_stderr": 0.00463326950106869, "rougeLsum_fmeasure": 0.11021067093653043, "rougeLsum_fmeasure_stderr": 0.0017201844997272798, "rougeLsum_precision": 0.07123063943054782, "rougeLsum_precision_stderr": 0.0014639316194415596, "rougeLsum_recall": 0.36244464139095184, "rougeLsum_recall_stderr": 0.004771668346501126}}, "4": {"PALM_prompt": {"bleu": 0.5688726039813448, "bleu_stderr": 0.03499549171771342, "rouge1_fmeasure": 0.12058598866659478, "rouge1_fmeasure_stderr": 0.001957868765398207, "rouge1_precision": 0.07795866646728995, "rouge1_precision_stderr": 0.0016639249531194892, "rouge1_recall": 0.3938155496452724, "rouge1_recall_stderr": 0.005078565192845451, "rouge2_fmeasure": 0.05450572513549272, "rouge2_fmeasure_stderr": 0.001253634240695061, "rouge2_precision": 0.03502553480612897, "rouge2_precision_stderr": 0.001028310626943955, "rouge2_recall": 0.18935831162471692, "rouge2_recall_stderr": 0.003633774069833336, "rougeL_fmeasure": 0.10978912137531217, "rougeL_fmeasure_stderr": 0.0017186306906886038, "rougeL_precision": 0.0710005783099636, "rougeL_precision_stderr": 0.0014996741097328555, "rougeL_recall": 0.3583987044932477, "rougeL_recall_stderr": 0.004478089320099575, "rougeLsum_fmeasure": 0.11358139308736688, "rougeLsum_fmeasure_stderr": 0.0018246864360938692, "rougeLsum_precision": 0.07354240936015581, "rougeLsum_precision_stderr": 0.0015810643210363616, "rougeLsum_recall": 0.37001495785797983, "rougeLsum_recall_stderr": 0.004663135619148296}}, "5": {"PALM_prompt": {"bleu": 0.6580128403383632, "bleu_stderr": 0.03979048740937252, "rouge1_fmeasure": 0.12120653952290801, "rouge1_fmeasure_stderr": 0.0018925406906279182, "rouge1_precision": 0.07882434482397714, "rouge1_precision_stderr": 0.0017453979591356575, "rouge1_recall": 0.4094912006873992, "rouge1_recall_stderr": 0.005322884073369286, "rouge2_fmeasure": 0.0554870860380319, "rouge2_fmeasure_stderr": 0.0011808802101015506, "rouge2_precision": 0.035869586954036733, "rouge2_precision_stderr": 0.0010232674681638256, "rouge2_recall": 0.20177695925262631, "rouge2_recall_stderr": 0.0038555438049923973, "rougeL_fmeasure": 0.1101712907081191, "rougeL_fmeasure_stderr": 0.0016797733697701302, "rougeL_precision": 0.07164290907529088, "rougeL_precision_stderr": 0.0015569758175689177, "rougeL_recall": 0.37103848909343684, "rougeL_recall_stderr": 0.004672357156707711, "rougeLsum_fmeasure": 0.11376874556323541, "rougeLsum_fmeasure_stderr": 0.0017606903173948368, "rougeLsum_precision": 0.07402169551091484, "rougeLsum_precision_stderr": 0.001622090088093523, "rougeLsum_recall": 0.3835426091505346, "rougeLsum_recall_stderr": 0.004850424234174864}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5441171855074767, "bleu_stderr": 0.058468990046918884, "rouge1_fmeasure": 0.17939214488610158, "rouge1_fmeasure_stderr": 0.0017804135350923409, "rouge1_precision": 0.15296865060249892, "rouge1_precision_stderr": 0.001837820952109684, "rouge1_recall": 0.26169124842748104, "rouge1_recall_stderr": 0.0025668809491873465, "rouge2_fmeasure": 0.03689634101691613, "rouge2_fmeasure_stderr": 0.0008335480154075091, "rouge2_precision": 0.03107472850516476, "rouge2_precision_stderr": 0.0007326879926008961, "rouge2_recall": 0.055773792128310504, "rouge2_recall_stderr": 0.0013801002433587823, "rougeL_fmeasure": 0.14190888358418866, "rougeL_fmeasure_stderr": 0.0012823537558295083, "rougeL_precision": 0.11938010040345436, "rougeL_precision_stderr": 0.001285307745098763, "rougeL_recall": 0.21229112067417366, "rougeL_recall_stderr": 0.002133442854384033, "rougeLsum_fmeasure": 0.16410222290223417, "rougeLsum_fmeasure_stderr": 0.0016141436958916839, "rougeLsum_precision": 0.13960767712336827, "rougeLsum_precision_stderr": 0.0016604976751595994, "rougeLsum_recall": 0.24051478150650776, "rougeLsum_recall_stderr": 0.0023804859164215262}}, "1": {"tldr_en": {"bleu": 2.0371472726893503, "bleu_stderr": 0.06445886557437613, "rouge1_fmeasure": 0.193112078225177, "rouge1_fmeasure_stderr": 0.0018971076935866038, "rouge1_precision": 0.17036737673960853, "rouge1_precision_stderr": 0.0020899573553767622, "rouge1_recall": 0.27375019590132627, "rouge1_recall_stderr": 0.002747096641084084, "rouge2_fmeasure": 0.042953733672235817, "rouge2_fmeasure_stderr": 0.000924164525215074, "rouge2_precision": 0.037939965824224105, "rouge2_precision_stderr": 0.0009381884690303104, "rouge2_recall": 0.06322413550907502, "rouge2_recall_stderr": 0.0015398866762030678, "rougeL_fmeasure": 0.14372809988310234, "rougeL_fmeasure_stderr": 0.0013224587565042005, "rougeL_precision": 0.12589556107132205, "rougeL_precision_stderr": 0.0014649128169422912, "rougeL_recall": 0.20769894415074885, "rougeL_recall_stderr": 0.002141105521843318, "rougeLsum_fmeasure": 0.18079495694560394, "rougeLsum_fmeasure_stderr": 0.0017666409264553958, "rougeLsum_precision": 0.15926144022330607, "rougeLsum_precision_stderr": 0.0019462692771155467, "rougeLsum_recall": 0.257050138374747, "rougeLsum_recall_stderr": 0.0025933226058397914}}, "2": {"tldr_en": {"bleu": 2.1035233164263456, "bleu_stderr": 0.06526356006576682, "rouge1_fmeasure": 0.1884485062116365, "rouge1_fmeasure_stderr": 0.001860450880454973, "rouge1_precision": 0.1734228018426794, "rouge1_precision_stderr": 0.0021870294162637167, "rouge1_recall": 0.25836940670007313, "rouge1_recall_stderr": 0.002647909074428666, "rouge2_fmeasure": 0.041658085133235837, "rouge2_fmeasure_stderr": 0.0009383498686193238, "rouge2_precision": 0.03841408476515862, "rouge2_precision_stderr": 0.0009912250995392628, "rouge2_recall": 0.058457900030295655, "rouge2_recall_stderr": 0.0014624713050720476, "rougeL_fmeasure": 0.14294429727362473, "rougeL_fmeasure_stderr": 0.00132783588949218, "rougeL_precision": 0.13094043101023076, "rougeL_precision_stderr": 0.0015815373573667445, "rougeL_recall": 0.19902411964243416, "rougeL_recall_stderr": 0.002086625916324095, "rougeLsum_fmeasure": 0.17699619195337532, "rougeLsum_fmeasure_stderr": 0.0017386037119207789, "rougeLsum_precision": 0.1626799656018026, "rougeLsum_precision_stderr": 0.002048151665068223, "rougeLsum_recall": 0.24348103096563084, "rougeLsum_recall_stderr": 0.0025140622431989454}}, "3": {"tldr_en": {"bleu": 2.0708386333938336, "bleu_stderr": 0.08532653212074892, "rouge1_fmeasure": 0.15529212766897169, "rouge1_fmeasure_stderr": 0.002079002689891638, "rouge1_precision": 0.1506568705346248, "rouge1_precision_stderr": 0.0025215780511801873, "rouge1_recall": 0.2116483891779261, "rouge1_recall_stderr": 0.003008853810510804, "rouge2_fmeasure": 0.03537230832895332, "rouge2_fmeasure_stderr": 0.0009189287842390869, "rouge2_precision": 0.03420162698259617, "rouge2_precision_stderr": 0.0010666091585980308, "rouge2_recall": 0.04960569870499745, "rouge2_recall_stderr": 0.0014340401382855095, "rougeL_fmeasure": 0.11968918711026949, "rougeL_fmeasure_stderr": 0.0015628721855696266, "rougeL_precision": 0.11650125992583188, "rougeL_precision_stderr": 0.0019973810684907898, "rougeL_recall": 0.16549229904821522, "rougeL_recall_stderr": 0.002413998964797247, "rougeLsum_fmeasure": 0.14571157748768584, "rougeLsum_fmeasure_stderr": 0.001941605436782365, "rougeLsum_precision": 0.14140024751156657, "rougeLsum_precision_stderr": 0.0023760171960274692, "rougeLsum_recall": 0.1992814333476404, "rougeLsum_recall_stderr": 0.002849664130133249}}, "4": {"tldr_en": {"bleu": 0.49445935730024393, "bleu_stderr": 0.024088624195397753, "rouge1_fmeasure": 0.05022387283014506, "rouge1_fmeasure_stderr": 0.0017512843671334136, "rouge1_precision": 0.05176861984860057, "rouge1_precision_stderr": 0.0021271263706370393, "rouge1_recall": 0.071136841878653, "rouge1_recall_stderr": 0.002558672934489541, "rouge2_fmeasure": 0.011291919512703433, "rouge2_fmeasure_stderr": 0.0006114676347614693, "rouge2_precision": 0.01196007994993277, "rouge2_precision_stderr": 0.0009013123264068118, "rouge2_recall": 0.016658477991511225, "rouge2_recall_stderr": 0.0009671439179288056, "rougeL_fmeasure": 0.03946724877727848, "rougeL_fmeasure_stderr": 0.0013608633103211994, "rougeL_precision": 0.04109240219201992, "rougeL_precision_stderr": 0.0017283133061432232, "rougeL_recall": 0.056569051990477744, "rougeL_recall_stderr": 0.0020487608971683237, "rougeLsum_fmeasure": 0.047043103436935976, "rougeLsum_fmeasure_stderr": 0.0016452355930115614, "rougeLsum_precision": 0.04852726679811264, "rougeLsum_precision_stderr": 0.0020106742342935737, "rougeLsum_recall": 0.0668119942082139, "rougeLsum_recall_stderr": 0.0024159420253348383}}, "5": {"tldr_en": {"bleu": 1.6856105428368102e-06, "bleu_stderr": 3.880121959798532e-06, "rouge1_fmeasure": 0.008055512719990391, "rouge1_fmeasure_stderr": 0.0007587491977737073, "rouge1_precision": 0.008470699600571485, "rouge1_precision_stderr": 0.0009200886254866315, "rouge1_recall": 0.0116499090164173, "rouge1_recall_stderr": 0.0011261542628617407, "rouge2_fmeasure": 0.0018401062454758718, "rouge2_fmeasure_stderr": 0.00024113764717061088, "rouge2_precision": 0.0019691685829136708, "rouge2_precision_stderr": 0.00039941218802917803, "rouge2_recall": 0.0029104214883650358, "rouge2_recall_stderr": 0.0004428510365188812, "rougeL_fmeasure": 0.006456007937905347, "rougeL_fmeasure_stderr": 0.0006087192193596801, "rougeL_precision": 0.006782369156684581, "rougeL_precision_stderr": 0.0007517673138759126, "rougeL_recall": 0.009488752669874, "rougeL_recall_stderr": 0.0009294652225507753, "rougeLsum_fmeasure": 0.0075592314810126, "rougeLsum_fmeasure_stderr": 0.0007182399759145957, "rougeLsum_precision": 0.007964884621764316, "rougeLsum_precision_stderr": 0.0008803684865861808, "rougeLsum_recall": 0.010955431751909476, "rougeLsum_recall_stderr": 0.0010645735541727445}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.05917870550467971, "bleu_stderr": 0.021439430498213147, "rouge1_fmeasure": 0.01694591102577263, "rouge1_fmeasure_stderr": 0.00039478655670680056, "rouge1_precision": 0.013478744707580847, "rouge1_precision_stderr": 0.0003248531010059893, "rouge1_recall": 0.024336686884176808, "rouge1_recall_stderr": 0.0005936781998547862, "rouge2_fmeasure": 0.00033458548372763047, "rouge2_fmeasure_stderr": 0.00011052775646473782, "rouge2_precision": 0.00027175125175160587, "rouge2_precision_stderr": 8.832609141884472e-05, "rouge2_recall": 0.0004610068688759196, "rouge2_recall_stderr": 0.00015125719676224283, "rougeL_fmeasure": 0.01687494930775209, "rougeL_fmeasure_stderr": 0.00038076831927792644, "rougeL_precision": 0.013397548126384263, "rougeL_precision_stderr": 0.00030395901721982514, "rougeL_recall": 0.024264717187207113, "rougeL_recall_stderr": 0.0005819858820998335, "rougeLsum_fmeasure": 0.016153917938285457, "rougeLsum_fmeasure_stderr": 0.0003601396039880173, "rougeLsum_precision": 0.012843223489373723, "rougeLsum_precision_stderr": 0.00028966911671447516, "rougeLsum_recall": 0.023160611471050408, "rougeLsum_recall_stderr": 0.0005422110493582207}}, "1": {"generate_text_restaurant": {"bleu": 6.417831573052252, "bleu_stderr": 0.09933071128723041, "rouge1_fmeasure": 0.35535472148104835, "rouge1_fmeasure_stderr": 0.0022861724049496736, "rouge1_precision": 0.3270073065419512, "rouge1_precision_stderr": 0.0029410013606450264, "rouge1_recall": 0.45046398403254573, "rouge1_recall_stderr": 0.002823906899289483, "rouge2_fmeasure": 0.15117396956389217, "rouge2_fmeasure_stderr": 0.0016447464750995614, "rouge2_precision": 0.1401241410095008, "rouge2_precision_stderr": 0.0018531449241507667, "rouge2_recall": 0.19286536992009515, "rouge2_recall_stderr": 0.002090107731469081, "rougeL_fmeasure": 0.2718942580157423, "rougeL_fmeasure_stderr": 0.001650498917151088, "rougeL_precision": 0.24763224688897645, "rougeL_precision_stderr": 0.0021273893467661244, "rougeL_recall": 0.35088920807423885, "rougeL_recall_stderr": 0.0023694516955726544, "rougeLsum_fmeasure": 0.2914787807919549, "rougeLsum_fmeasure_stderr": 0.0021073074783888317, "rougeLsum_precision": 0.2683743408489297, "rougeLsum_precision_stderr": 0.002588898879750606, "rougeLsum_recall": 0.36958156522630725, "rougeLsum_recall_stderr": 0.0026484045078706137}}, "2": {"generate_text_restaurant": {"bleu": 7.144866575133128, "bleu_stderr": 0.11767976865436432, "rouge1_fmeasure": 0.3714682726190391, "rouge1_fmeasure_stderr": 0.002244670036020805, "rouge1_precision": 0.3363637768536738, "rouge1_precision_stderr": 0.0029826877514094984, "rouge1_recall": 0.47854404689146396, "rouge1_recall_stderr": 0.002723064462086883, "rouge2_fmeasure": 0.1689945125341207, "rouge2_fmeasure_stderr": 0.0016621353799920215, "rouge2_precision": 0.1536551921862161, "rouge2_precision_stderr": 0.0019146822480363551, "rouge2_recall": 0.22008266453179484, "rouge2_recall_stderr": 0.0021169213496211804, "rougeL_fmeasure": 0.28621488347173407, "rougeL_fmeasure_stderr": 0.0016530129285008005, "rougeL_precision": 0.2565245631891465, "rougeL_precision_stderr": 0.0021750263127523267, "rougeL_recall": 0.3751391147634627, "rougeL_recall_stderr": 0.002368430835477448, "rougeLsum_fmeasure": 0.30863451544091786, "rougeLsum_fmeasure_stderr": 0.0021207277891853935, "rougeLsum_precision": 0.27986713718242573, "rougeLsum_precision_stderr": 0.0026718889235212745, "rougeLsum_recall": 0.39712567866655407, "rougeLsum_recall_stderr": 0.0026000778308341633}}, "3": {"generate_text_restaurant": {"bleu": 8.499047110531965, "bleu_stderr": 0.13158189889347094, "rouge1_fmeasure": 0.40166818643818236, "rouge1_fmeasure_stderr": 0.002314516219733783, "rouge1_precision": 0.39367247241289555, "rouge1_precision_stderr": 0.003355140132833485, "rouge1_recall": 0.4733493228069879, "rouge1_recall_stderr": 0.0027399128984850938, "rouge2_fmeasure": 0.18929612249057426, "rouge2_fmeasure_stderr": 0.001820291860246511, "rouge2_precision": 0.18739668231963508, "rouge2_precision_stderr": 0.0022894313415515234, "rouge2_recall": 0.22436666999546287, "rouge2_recall_stderr": 0.0021578233400330656, "rougeL_fmeasure": 0.304941258359424, "rougeL_fmeasure_stderr": 0.0018519277760080438, "rougeL_precision": 0.29709709963337616, "rougeL_precision_stderr": 0.0026267440040518976, "rougeL_recall": 0.363645752674414, "rougeL_recall_stderr": 0.0023895292399462764, "rougeLsum_fmeasure": 0.3366450375434392, "rougeLsum_fmeasure_stderr": 0.0022248167592229763, "rougeLsum_precision": 0.3298766332093264, "rougeLsum_precision_stderr": 0.0030284038518238743, "rougeLsum_recall": 0.39696017152039725, "rougeLsum_recall_stderr": 0.002640555678033001}}, "4": {"generate_text_restaurant": {"bleu": 8.977345314148437, "bleu_stderr": 0.136331232899511, "rouge1_fmeasure": 0.4137218280270365, "rouge1_fmeasure_stderr": 0.00229192558316451, "rouge1_precision": 0.42219539407433715, "rouge1_precision_stderr": 0.003468290252983904, "rouge1_recall": 0.46659038237235495, "rouge1_recall_stderr": 0.0026745023056816263, "rouge2_fmeasure": 0.1962921120031527, "rouge2_fmeasure_stderr": 0.0018551125349853047, "rouge2_precision": 0.20320839450126849, "rouge2_precision_stderr": 0.002422458324324043, "rouge2_recall": 0.2218045628736145, "rouge2_recall_stderr": 0.0021361331838705595, "rougeL_fmeasure": 0.31149638577874117, "rougeL_fmeasure_stderr": 0.0018866947594172608, "rougeL_precision": 0.31691055753990927, "rougeL_precision_stderr": 0.002796255165982753, "rougeL_recall": 0.35474356867721046, "rougeL_recall_stderr": 0.0023597567616977237, "rougeLsum_fmeasure": 0.3480299709780515, "rougeLsum_fmeasure_stderr": 0.002215355607264788, "rougeLsum_precision": 0.3549635366471692, "rougeLsum_precision_stderr": 0.003154853335977498, "rougeLsum_recall": 0.39337220173064513, "rougeLsum_recall_stderr": 0.0026226330676984337}}, "5": {"generate_text_restaurant": {"bleu": 9.100746043108366, "bleu_stderr": 0.13904428510247194, "rouge1_fmeasure": 0.416889541669577, "rouge1_fmeasure_stderr": 0.0022593173429326734, "rouge1_precision": 0.43164416574321207, "rouge1_precision_stderr": 0.0035247087906401533, "rouge1_recall": 0.46393300365544937, "rouge1_recall_stderr": 0.0026390992087383935, "rouge2_fmeasure": 0.19877083959875572, "rouge2_fmeasure_stderr": 0.001838457868273613, "rouge2_precision": 0.20893177101713536, "rouge2_precision_stderr": 0.002479440253245222, "rouge2_recall": 0.22138470377348013, "rouge2_recall_stderr": 0.0020735347416350257, "rougeL_fmeasure": 0.3141171543495437, "rougeL_fmeasure_stderr": 0.0018865940368174637, "rougeL_precision": 0.32413833650774926, "rougeL_precision_stderr": 0.002847430657050952, "rougeL_recall": 0.3525844418983698, "rougeL_recall_stderr": 0.002316048099912866, "rougeLsum_fmeasure": 0.3534290998927807, "rougeLsum_fmeasure_stderr": 0.002214363788536545, "rougeLsum_precision": 0.3657039327671768, "rougeLsum_precision_stderr": 0.003231306179751167, "rougeLsum_recall": 0.3941170576961099, "rougeLsum_recall_stderr": 0.0026158137795935115}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7357251912952043, "bleu_stderr": 0.07015285777750578, "rouge1_fmeasure": 0.19919382153347004, "rouge1_fmeasure_stderr": 0.0025010328697608777, "rouge1_precision": 0.14862724178358616, "rouge1_precision_stderr": 0.0021900107332745392, "rouge1_recall": 0.3337541841058178, "rouge1_recall_stderr": 0.004265810055674235, "rouge2_fmeasure": 0.04301195793696945, "rouge2_fmeasure_stderr": 0.0015237839114813907, "rouge2_precision": 0.031624579711508685, "rouge2_precision_stderr": 0.0011555149143629379, "rouge2_recall": 0.07441377803552965, "rouge2_recall_stderr": 0.0027141967472220536, "rougeL_fmeasure": 0.15205737262318364, "rougeL_fmeasure_stderr": 0.0018771623450897423, "rougeL_precision": 0.11312850293639194, "rougeL_precision_stderr": 0.0015990209337972076, "rougeL_recall": 0.256292931336433, "rougeL_recall_stderr": 0.0033716170663084088, "rougeLsum_fmeasure": 0.15548487723181087, "rougeLsum_fmeasure_stderr": 0.0020818473038173367, "rougeLsum_precision": 0.1155133946544348, "rougeLsum_precision_stderr": 0.0017187641613891761, "rougeLsum_recall": 0.26230266384787015, "rougeLsum_recall_stderr": 0.0037125025121796515}}, "1": {"article_DOC_summary": {"bleu": 1.142797381411138, "bleu_stderr": 0.06382932547893247, "rouge1_fmeasure": 0.1656700490248908, "rouge1_fmeasure_stderr": 0.002300335199398068, "rouge1_precision": 0.11756548621396466, "rouge1_precision_stderr": 0.0017039245368348878, "rouge1_recall": 0.29257190771342717, "rouge1_recall_stderr": 0.0040111596267870175, "rouge2_fmeasure": 0.02999018501144029, "rouge2_fmeasure_stderr": 0.0012790241082206767, "rouge2_precision": 0.02106853393218671, "rouge2_precision_stderr": 0.0009002910576438377, "rouge2_recall": 0.054432952758428556, "rouge2_recall_stderr": 0.0023912713425260777, "rougeL_fmeasure": 0.12980281352012504, "rougeL_fmeasure_stderr": 0.0017097107245796766, "rougeL_precision": 0.0919219200234561, "rougeL_precision_stderr": 0.0012562375365408045, "rougeL_recall": 0.23071754465962444, "rougeL_recall_stderr": 0.0031261480109302502, "rougeLsum_fmeasure": 0.13162835727493158, "rougeLsum_fmeasure_stderr": 0.0018618813809922038, "rougeLsum_precision": 0.09316913757751522, "rougeLsum_precision_stderr": 0.001359452910537994, "rougeLsum_recall": 0.23414449790551817, "rougeLsum_recall_stderr": 0.0033914459404180363}}, "2": {"article_DOC_summary": {"bleu": 1.203193105589264, "bleu_stderr": 0.09357359220023674, "rouge1_fmeasure": 0.1664869894922336, "rouge1_fmeasure_stderr": 0.0024048311165237655, "rouge1_precision": 0.11809628667008427, "rouge1_precision_stderr": 0.0017819579555278797, "rouge1_recall": 0.29415562750545377, "rouge1_recall_stderr": 0.004175428217745066, "rouge2_fmeasure": 0.031359702416680915, "rouge2_fmeasure_stderr": 0.0013246131420218, "rouge2_precision": 0.02202415396875264, "rouge2_precision_stderr": 0.0009312900773519731, "rouge2_recall": 0.056832176069842685, "rouge2_recall_stderr": 0.002470027826249069, "rougeL_fmeasure": 0.13114255284856108, "rougeL_fmeasure_stderr": 0.0017938136938261508, "rougeL_precision": 0.09285389775840713, "rougeL_precision_stderr": 0.0013172320798947644, "rougeL_recall": 0.2330228822661898, "rougeL_recall_stderr": 0.0032509542493564525, "rougeLsum_fmeasure": 0.13288323149460116, "rougeLsum_fmeasure_stderr": 0.001971045136299284, "rougeLsum_precision": 0.09402908997291087, "rougeLsum_precision_stderr": 0.0014407752934802245, "rougeLsum_recall": 0.23647906078908015, "rougeLsum_recall_stderr": 0.003566059966771714}}, "3": {"article_DOC_summary": {"bleu": 1.235138294709963, "bleu_stderr": 0.10805024020647826, "rouge1_fmeasure": 0.16011355121062382, "rouge1_fmeasure_stderr": 0.002459993071724102, "rouge1_precision": 0.11637369901242395, "rouge1_precision_stderr": 0.001955543886976224, "rouge1_recall": 0.27761487556444325, "rouge1_recall_stderr": 0.004212942254103512, "rouge2_fmeasure": 0.02981894291789061, "rouge2_fmeasure_stderr": 0.0012838368753906573, "rouge2_precision": 0.02144033178353626, "rouge2_precision_stderr": 0.0009761863694245551, "rouge2_recall": 0.05314245590833923, "rouge2_recall_stderr": 0.0023319835959942714, "rougeL_fmeasure": 0.1274345805400011, "rougeL_fmeasure_stderr": 0.0018877850754460694, "rougeL_precision": 0.09259941059811318, "rougeL_precision_stderr": 0.0015273565883123174, "rougeL_recall": 0.22244331428762865, "rougeL_recall_stderr": 0.0033645133626818632, "rougeLsum_fmeasure": 0.12931560711483042, "rougeLsum_fmeasure_stderr": 0.002039943837785206, "rougeLsum_precision": 0.09379953891916527, "rougeLsum_precision_stderr": 0.001614111920018084, "rougeLsum_recall": 0.226162619913405, "rougeLsum_recall_stderr": 0.00364702525536272}}, "4": {"article_DOC_summary": {"bleu": 0.7124937798594313, "bleu_stderr": 0.1416893425661407, "rouge1_fmeasure": 0.04529575409576912, "rouge1_fmeasure_stderr": 0.0026096787087263908, "rouge1_precision": 0.038521021782917324, "rouge1_precision_stderr": 0.002579239954607112, "rouge1_recall": 0.07087374534410745, "rouge1_recall_stderr": 0.004075178170838202, "rouge2_fmeasure": 0.00898850210998665, "rouge2_fmeasure_stderr": 0.00100077562757513, "rouge2_precision": 0.00799125319204421, "rouge2_precision_stderr": 0.0011552604497687927, "rouge2_recall": 0.013998796100064619, "rouge2_recall_stderr": 0.0014402265840230781, "rougeL_fmeasure": 0.03594504302331302, "rougeL_fmeasure_stderr": 0.002040260108424175, "rougeL_precision": 0.030913190179522012, "rougeL_precision_stderr": 0.0021063563387628396, "rougeL_recall": 0.0564873557506103, "rougeL_recall_stderr": 0.003208770140638667, "rougeLsum_fmeasure": 0.037016014691665926, "rougeLsum_fmeasure_stderr": 0.0021672607800995647, "rougeLsum_precision": 0.03178988261987462, "rougeLsum_precision_stderr": 0.0021919754292872226, "rougeLsum_recall": 0.057998197004667344, "rougeLsum_recall_stderr": 0.0033846296530289487}}, "5": {"article_DOC_summary": {"bleu": 7.929891497056582e-37, "bleu_stderr": 1.9093989194927428e-31, "rouge1_fmeasure": 0.0023650421490574954, "rouge1_fmeasure_stderr": 0.0006208504370497826, "rouge1_precision": 0.002546465232296764, "rouge1_precision_stderr": 0.0006775269263162023, "rouge1_recall": 0.0023177367033083425, "rouge1_recall_stderr": 0.0006142608858766371, "rouge2_fmeasure": 0.000236662827910983, "rouge2_fmeasure_stderr": 0.00016598149457993803, "rouge2_precision": 0.0002438368143140618, "rouge2_precision_stderr": 0.0001596639621589295, "rouge2_recall": 0.00023881778598759733, "rouge2_recall_stderr": 0.00017794531602779935, "rougeL_fmeasure": 0.001834154717384653, "rougeL_fmeasure_stderr": 0.0004870383066728937, "rougeL_precision": 0.0019812215307717387, "rougeL_precision_stderr": 0.0005291032639085791, "rougeL_recall": 0.001781631282478878, "rougeL_recall_stderr": 0.0004773990716078782, "rougeLsum_fmeasure": 0.0019382435007187192, "rougeLsum_fmeasure_stderr": 0.0005100160485522995, "rougeLsum_precision": 0.002076514078894475, "rougeLsum_precision_stderr": 0.0005498540873240215, "rougeLsum_recall": 0.001920599581824536, "rougeLsum_recall_stderr": 0.0005186953029882198}}}} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_0.csv b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..89d5e0e45735a097878ff94a9a9a4cb1e3dfb0fa --- /dev/null +++ b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732963,0 +anli_r2,acc,0.339,0.014976758771620345,0 +anli_r3,acc,0.3308333333333333,0.013588208070709,0 +arc_challenge,acc,0.25170648464163825,0.012682496334042961,0 +arc_challenge,acc_norm,0.2781569965870307,0.013094469919538805,0 +arc_easy,acc,0.5631313131313131,0.010177672928157685,0 +arc_easy,acc_norm,0.5046296296296297,0.01025934370588972,0 +boolq,acc,0.6217125382262997,0.008482001133930998,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.1940928270042194,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4382593108942442,0.00495159406327205,0 +hellaswag,acc_norm,0.5646285600477993,0.004947922692688843,0 +piqa,acc,0.736126224156692,0.010282996367695562,0 +piqa,acc_norm,0.7437431991294886,0.01018578783156504,0 +rte,acc,0.5270758122743683,0.030052303463143706,0 +sciq,acc,0.82,0.012155153135511965,0 +sciq,acc_norm,0.741,0.01386041525752791,0 +storycloze_2016,acc,0.6873329770176376,0.01072022317295317,0 +winogrande,acc,0.5398579321231255,0.014007765428365163,0 diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json deleted file mode 100644 index 653a3297924951aa77461808b153cbe852f4b4d1..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732963 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.014976758771620345 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070709 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.1940928270042194 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4382593108942442, - "acc_stderr": 0.00495159406327205, - "acc_norm": 0.5646285600477993, - "acc_norm_stderr": 0.004947922692688843 - }, - "rte": { - "acc": 0.5270758122743683, - "acc_stderr": 0.030052303463143706 - }, - "winogrande": { - "acc": 0.5398579321231255, - "acc_stderr": 0.014007765428365163 - }, - "storycloze_2016": { - "acc": 0.6873329770176376, - "acc_stderr": 0.01072022317295317 - }, - "boolq": { - "acc": 0.6217125382262997, - "acc_stderr": 0.008482001133930998 - }, - "arc_easy": { - "acc": 0.5631313131313131, - "acc_stderr": 0.010177672928157685, - "acc_norm": 0.5046296296296297, - "acc_norm_stderr": 0.01025934370588972 - }, - "arc_challenge": { - "acc": 0.25170648464163825, - "acc_stderr": 0.012682496334042961, - "acc_norm": 0.2781569965870307, - "acc_norm_stderr": 0.013094469919538805 - }, - "sciq": { - "acc": 0.82, - "acc_stderr": 0.012155153135511965, - "acc_norm": 0.741, - "acc_norm_stderr": 0.01386041525752791 - }, - "piqa": { - "acc": 0.736126224156692, - "acc_stderr": 0.010282996367695562, - "acc_norm": 0.7437431991294886, - "acc_norm_stderr": 0.01018578783156504 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_1.csv b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..65b1f44c971670ed0c40c1315283b849e1ec56ba --- /dev/null +++ b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.351,0.015100563798316405,0 +anli_r2,acc,0.337,0.014955087918653602,0 +anli_r3,acc,0.36,0.013862183574189906,0 +arc_challenge,acc,0.26706484641638223,0.01292893319649636,0 +arc_challenge,acc_norm,0.27047781569965873,0.012980954547659554,0 +arc_easy,acc,0.577020202020202,0.010137328382209099,0 +arc_easy,acc_norm,0.5437710437710438,0.010220394383722024,0 +boolq,acc,0.5865443425076453,0.00861305923994264,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.26455026455026454,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4320852419836686,0.0049435372423444176,0 +hellaswag,acc_norm,0.5631348336984664,0.004949842967331437,0 +piqa,acc,0.7377584330794341,0.010262502565172449,0 +piqa,acc_norm,0.7366702937976061,0.010276185322196764,0 +rte,acc,0.5595667870036101,0.029882123363118712,0 +sciq,acc,0.876,0.010427498872343961,0 +sciq,acc_norm,0.847,0.011389500459665537,0 +storycloze_2016,acc,0.677712453233565,0.010807461374996358,0 +winogrande,acc,0.5359116022099447,0.014016193433958308,0 diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json deleted file mode 100644 index 7957ab1f65b02179bf2699f2423b79fa80f6abc5..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316405 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.014955087918653602 - }, - "anli_r3": { - "acc": 0.36, - "acc_stderr": 0.013862183574189906 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.26455026455026454 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4320852419836686, - "acc_stderr": 0.0049435372423444176, - "acc_norm": 0.5631348336984664, - "acc_norm_stderr": 0.004949842967331437 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118712 - }, - "winogrande": { - "acc": 0.5359116022099447, - "acc_stderr": 0.014016193433958308 - }, - "storycloze_2016": { - "acc": 0.677712453233565, - "acc_stderr": 0.010807461374996358 - }, - "boolq": { - "acc": 0.5865443425076453, - "acc_stderr": 0.00861305923994264 - }, - "arc_easy": { - "acc": 0.577020202020202, - "acc_stderr": 0.010137328382209099, - "acc_norm": 0.5437710437710438, - "acc_norm_stderr": 0.010220394383722024 - }, - "arc_challenge": { - "acc": 0.26706484641638223, - "acc_stderr": 0.01292893319649636, - "acc_norm": 0.27047781569965873, - "acc_norm_stderr": 0.012980954547659554 - }, - "sciq": { - "acc": 0.876, - "acc_stderr": 0.010427498872343961, - "acc_norm": 0.847, - "acc_norm_stderr": 0.011389500459665537 - }, - "piqa": { - "acc": 0.7377584330794341, - "acc_stderr": 0.010262502565172449, - "acc_norm": 0.7366702937976061, - "acc_norm_stderr": 0.010276185322196764 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_2.csv b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..0fb26c69b96d9b250ee05e132b9e66b8adbe612f --- /dev/null +++ b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928364,0 +anli_r2,acc,0.333,0.01491084616422986,0 +anli_r3,acc,0.31166666666666665,0.013376268790982089,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378606,0 +arc_challenge,acc_norm,0.2858361774744027,0.013203196088537369,0 +arc_easy,acc,0.5803872053872053,0.010126315840891536,0 +arc_easy,acc_norm,0.5673400673400674,0.010166307932642874,0 +boolq,acc,0.5886850152905199,0.008606395426309208,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.29526609377355645,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.43089026090420235,0.004941887610849041,0 +hellaswag,acc_norm,0.5630352519418442,0.004949969363017659,0 +piqa,acc,0.7317736670293797,0.010336761992404485,0 +piqa,acc_norm,0.7442872687704026,0.01017869010945988,0 +rte,acc,0.5667870036101083,0.029826764082138277,0 +sciq,acc,0.893,0.009779910359847169,0 +sciq,acc_norm,0.877,0.010391293421849877,0 +storycloze_2016,acc,0.686798503474078,0.010725209422929401,0 +winogrande,acc,0.5540647198105761,0.013970093482330694,0 diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json deleted file mode 100644 index 5c8c05d68b2428db1743f06092842ae517d9995f..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928364 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.01491084616422986 - }, - "anli_r3": { - "acc": 0.31166666666666665, - "acc_stderr": 0.013376268790982089 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.29526609377355645 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.43089026090420235, - "acc_stderr": 0.004941887610849041, - "acc_norm": 0.5630352519418442, - "acc_norm_stderr": 0.004949969363017659 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.029826764082138277 - }, - "winogrande": { - "acc": 0.5540647198105761, - "acc_stderr": 0.013970093482330694 - }, - "storycloze_2016": { - "acc": 0.686798503474078, - "acc_stderr": 0.010725209422929401 - }, - "boolq": { - "acc": 0.5886850152905199, - "acc_stderr": 0.008606395426309208 - }, - "arc_easy": { - "acc": 0.5803872053872053, - "acc_stderr": 0.010126315840891536, - "acc_norm": 0.5673400673400674, - "acc_norm_stderr": 0.010166307932642874 - }, - "arc_challenge": { - "acc": 0.25341296928327645, - "acc_stderr": 0.012710896778378606, - "acc_norm": 0.2858361774744027, - "acc_norm_stderr": 0.013203196088537369 - }, - "sciq": { - "acc": 0.893, - "acc_stderr": 0.009779910359847169, - "acc_norm": 0.877, - "acc_norm_stderr": 0.010391293421849877 - }, - "piqa": { - "acc": 0.7317736670293797, - "acc_stderr": 0.010336761992404485, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.01017869010945988 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_3.csv b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..a0ca1f1c12edf718bdda3b7112171d40eb233e81 --- /dev/null +++ b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.338,0.014965960710224487,0 +anli_r2,acc,0.352,0.01511040450564867,0 +anli_r3,acc,0.3416666666666667,0.013696658778002505,0 +arc_challenge,acc,0.2593856655290102,0.012808273573927104,0 +arc_challenge,acc_norm,0.28668941979522183,0.013214986329274776,0 +arc_easy,acc,0.5774410774410774,0.010135978222981077,0 +arc_easy,acc_norm,0.5627104377104377,0.010178768429321586,0 +boolq,acc,0.581651376146789,0.00862766139082541,1 +cb,acc,0.32142857142857145,0.06297362289056341,1 +cb,f1,0.19607843137254902,,1 +copa,acc,0.77,0.042295258468165065,0 +hellaswag,acc,0.4305915156343358,0.004941470620074864,0 +hellaswag,acc_norm,0.5668193586934873,0.004945023657032274,0 +piqa,acc,0.7377584330794341,0.010262502565172449,0 +piqa,acc_norm,0.7442872687704026,0.01017869010945988,0 +rte,acc,0.555956678700361,0.029907396333795994,0 +sciq,acc,0.894,0.00973955126578513,0 +sciq,acc_norm,0.88,0.010281328012747391,0 +storycloze_2016,acc,0.6809192944949225,0.010778970635312498,0 +winogrande,acc,0.5398579321231255,0.014007765428365168,0 diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json deleted file mode 100644 index 36dd8a436a15171574d82b012e7f61b905265889..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.014965960710224487 - }, - "anli_r2": { - "acc": 0.352, - "acc_stderr": 0.01511040450564867 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002505 - }, - "cb": { - "acc": 0.32142857142857145, - "acc_stderr": 0.06297362289056341, - "f1": 0.19607843137254902 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.042295258468165065 - }, - "hellaswag": { - "acc": 0.4305915156343358, - "acc_stderr": 0.004941470620074864, - "acc_norm": 0.5668193586934873, - "acc_norm_stderr": 0.004945023657032274 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795994 - }, - "winogrande": { - "acc": 0.5398579321231255, - "acc_stderr": 0.014007765428365168 - }, - "storycloze_2016": { - "acc": 0.6809192944949225, - "acc_stderr": 0.010778970635312498 - }, - "boolq": { - "acc": 0.581651376146789, - "acc_stderr": 0.00862766139082541 - }, - "arc_easy": { - "acc": 0.5774410774410774, - "acc_stderr": 0.010135978222981077, - "acc_norm": 0.5627104377104377, - "acc_norm_stderr": 0.010178768429321586 - }, - "arc_challenge": { - "acc": 0.2593856655290102, - "acc_stderr": 0.012808273573927104, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.013214986329274776 - }, - "sciq": { - "acc": 0.894, - "acc_stderr": 0.00973955126578513, - "acc_norm": 0.88, - "acc_norm_stderr": 0.010281328012747391 - }, - "piqa": { - "acc": 0.7377584330794341, - "acc_stderr": 0.010262502565172449, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.01017869010945988 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_4.csv b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..9769988db1b505201ac5fe2f05657b03ac0506b0 --- /dev/null +++ b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.345,0.015039986742055235,0 +anli_r2,acc,0.333,0.014910846164229868,0 +anli_r3,acc,0.3375,0.013655897185463657,0 +arc_challenge,acc,0.2568259385665529,0.012766923794116796,0 +arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0 +arc_easy,acc,0.5883838383838383,0.01009821864671491,0 +arc_easy,acc_norm,0.5660774410774411,0.010169795770462103,0 +boolq,acc,0.5844036697247706,0.008619555273337565,1 +cb,acc,0.44642857142857145,0.067031892279424,1 +cb,f1,0.35095948827292106,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.42939653455486954,0.004939784311448986,0 +hellaswag,acc_norm,0.5685122485560645,0.004942716091996087,0 +piqa,acc,0.7323177366702938,0.01033011118937043,0 +piqa,acc_norm,0.7393906420021763,0.010241826155811633,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.901,0.009449248027662751,0 +sciq,acc_norm,0.887,0.01001655286669685,0 +storycloze_2016,acc,0.6910742918225548,0.010684853966268454,0 +winogrande,acc,0.5595895816890292,0.013952330311915615,0 diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json deleted file mode 100644 index f5d4672f43f4473c789e1a4c68ebe99aae9ef201..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.345, - "acc_stderr": 0.015039986742055235 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229868 - }, - "anli_r3": { - "acc": 0.3375, - "acc_stderr": 0.013655897185463657 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.067031892279424, - "f1": 0.35095948827292106 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.42939653455486954, - "acc_stderr": 0.004939784311448986, - "acc_norm": 0.5685122485560645, - "acc_norm_stderr": 0.004942716091996087 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5595895816890292, - "acc_stderr": 0.013952330311915615 - }, - "storycloze_2016": { - "acc": 0.6910742918225548, - "acc_stderr": 0.010684853966268454 - }, - "boolq": { - "acc": 0.5844036697247706, - "acc_stderr": 0.008619555273337565 - }, - "arc_easy": { - "acc": 0.5883838383838383, - "acc_stderr": 0.01009821864671491, - "acc_norm": 0.5660774410774411, - "acc_norm_stderr": 0.010169795770462103 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.012766923794116796, - "acc_norm": 0.2832764505119454, - "acc_norm_stderr": 0.013167478735134575 - }, - "sciq": { - "acc": 0.901, - "acc_stderr": 0.009449248027662751, - "acc_norm": 0.887, - "acc_norm_stderr": 0.01001655286669685 - }, - "piqa": { - "acc": 0.7323177366702938, - "acc_stderr": 0.01033011118937043, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.010241826155811633 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_5.csv b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..1b39474e2faa7e58be2826d5b985cddcbe0d40a8 --- /dev/null +++ b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.339,0.014976758771620345,0 +anli_r2,acc,0.338,0.014965960710224492,0 +anli_r3,acc,0.3258333333333333,0.013535422043417455,0 +arc_challenge,acc,0.2645051194539249,0.012889272949313366,0 +arc_challenge,acc_norm,0.28498293515358364,0.013191348179838793,0 +arc_easy,acc,0.5867003367003367,0.010104361780747516,0 +arc_easy,acc_norm,0.5656565656565656,0.010170943451269421,0 +boolq,acc,0.5837920489296636,0.008621380519419273,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.22319093286835223,,1 +copa,acc,0.76,0.042923469599092816,0 +hellaswag,acc,0.42850029874526985,0.0049385003039902915,0 +hellaswag,acc_norm,0.5700059749053973,0.004940631135803532,0 +piqa,acc,0.7301414581066377,0.010356595421852202,0 +piqa,acc_norm,0.7421109902067464,0.01020695666205624,0 +rte,acc,0.628158844765343,0.029091018492217437,0 +sciq,acc,0.901,0.00944924802766273,0 +sciq,acc_norm,0.886,0.01005510343582333,0 +storycloze_2016,acc,0.6878674505611972,0.010715220346279683,0 +winogrande,acc,0.5438042620363063,0.013998453610924326,0 diff --git a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json b/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json deleted file mode 100644 index 3aaaed275a33e614786d5ebdc1bc3de27ecfcf8a..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed3/evaluation/rankeval/2b855b18bc4seed3_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.339, - "acc_stderr": 0.014976758771620345 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224492 - }, - "anli_r3": { - "acc": 0.3258333333333333, - "acc_stderr": 0.013535422043417455 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.22319093286835223 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.042923469599092816 - }, - "hellaswag": { - "acc": 0.42850029874526985, - "acc_stderr": 0.0049385003039902915, - "acc_norm": 0.5700059749053973, - "acc_norm_stderr": 0.004940631135803532 - }, - "rte": { - "acc": 0.628158844765343, - "acc_stderr": 0.029091018492217437 - }, - "winogrande": { - "acc": 0.5438042620363063, - "acc_stderr": 0.013998453610924326 - }, - "storycloze_2016": { - "acc": 0.6878674505611972, - "acc_stderr": 0.010715220346279683 - }, - "boolq": { - "acc": 0.5837920489296636, - "acc_stderr": 0.008621380519419273 - }, - "arc_easy": { - "acc": 0.5867003367003367, - "acc_stderr": 0.010104361780747516, - "acc_norm": 0.5656565656565656, - "acc_norm_stderr": 0.010170943451269421 - }, - "arc_challenge": { - "acc": 0.2645051194539249, - "acc_stderr": 0.012889272949313366, - "acc_norm": 0.28498293515358364, - "acc_norm_stderr": 0.013191348179838793 - }, - "sciq": { - "acc": 0.901, - "acc_stderr": 0.00944924802766273, - "acc_norm": 0.886, - "acc_norm_stderr": 0.01005510343582333 - }, - "piqa": { - "acc": 0.7301414581066377, - "acc_stderr": 0.010356595421852202, - "acc_norm": 0.7421109902067464, - "acc_norm_stderr": 0.01020695666205624 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_0.csv b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..e80b2e9903586979766e2c6a5877cd452aaf7cdd --- /dev/null +++ b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928367,0 +anli_r2,acc,0.341,0.0149981313484027,0 +anli_r3,acc,0.33916666666666667,0.013672343491681812,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207706,0 +arc_challenge,acc_norm,0.2713310580204778,0.012993807727545794,0 +arc_easy,acc,0.5660774410774411,0.01016979577046211,0 +arc_easy,acc_norm,0.5067340067340067,0.010258852980991825,0 +boolq,acc,0.6,0.008568368985904955,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.25778915919760986,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4372634933280223,0.004950347333701827,0 +hellaswag,acc_norm,0.5640310695080661,0.004948696280312417,0 +piqa,acc,0.7372143634385201,0.010269354068140765,0 +piqa,acc_norm,0.7421109902067464,0.010206956662056245,0 +rte,acc,0.5595667870036101,0.02988212336311872,0 +sciq,acc,0.809,0.012436787112179487,0 +sciq,acc_norm,0.714,0.01429714686251791,0 +storycloze_2016,acc,0.6884019241047569,0.010710200919679799,0 +winogrande,acc,0.5682715074980268,0.013920872110010708,0 diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index c0747d63e288a23255897211aedb3d47e3cf03d7..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928367 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.0149981313484027 - }, - "anli_r3": { - "acc": 0.33916666666666667, - "acc_stderr": 0.013672343491681812 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.25778915919760986 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4372634933280223, - "acc_stderr": 0.004950347333701827, - "acc_norm": 0.5640310695080661, - "acc_norm_stderr": 0.004948696280312417 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.02988212336311872 - }, - "winogrande": { - "acc": 0.5682715074980268, - "acc_stderr": 0.013920872110010708 - }, - "storycloze_2016": { - "acc": 0.6884019241047569, - "acc_stderr": 0.010710200919679799 - }, - "boolq": { - "acc": 0.6, - "acc_stderr": 0.008568368985904955 - }, - "arc_easy": { - "acc": 0.5660774410774411, - "acc_stderr": 0.01016979577046211, - "acc_norm": 0.5067340067340067, - "acc_norm_stderr": 0.010258852980991825 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207706, - "acc_norm": 0.2713310580204778, - "acc_norm_stderr": 0.012993807727545794 - }, - "sciq": { - "acc": 0.809, - "acc_stderr": 0.012436787112179487, - "acc_norm": 0.714, - "acc_norm_stderr": 0.01429714686251791 - }, - "piqa": { - "acc": 0.7372143634385201, - "acc_stderr": 0.010269354068140765, - "acc_norm": 0.7421109902067464, - "acc_norm_stderr": 0.010206956662056245 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_1.csv b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..850c54fb4ab20a1993a1d479579bbc9c5cb161c6 --- /dev/null +++ b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.015050266127564445,0 +anli_r2,acc,0.364,0.015222868840522024,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.2721843003412969,0.013006600406423709,0 +arc_easy,acc,0.5845959595959596,0.010111869494911517,0 +arc_easy,acc_norm,0.5458754208754208,0.010216507710244106,0 +boolq,acc,0.5972477064220183,0.008578054401368405,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.37124414983384657,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.43248356901015733,0.004944080605048777,0 +hellaswag,acc_norm,0.5643298147779326,0.004948310399746082,0 +piqa,acc,0.735038084874864,0.010296557993316047,0 +piqa,acc_norm,0.7377584330794341,0.01026250256517244,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.849,0.011328165223341671,0 +sciq,acc_norm,0.82,0.012155153135511961,0 +storycloze_2016,acc,0.6819882415820417,0.010769343495248544,0 +winogrande,acc,0.5698500394632992,0.0139146850947167,0 diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index 3bcdaab1e35ec0d1bde1d1899d7003f729e3602b..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.015050266127564445 - }, - "anli_r2": { - "acc": 0.364, - "acc_stderr": 0.015222868840522024 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.37124414983384657 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.43248356901015733, - "acc_stderr": 0.004944080605048777, - "acc_norm": 0.5643298147779326, - "acc_norm_stderr": 0.004948310399746082 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5698500394632992, - "acc_stderr": 0.0139146850947167 - }, - "storycloze_2016": { - "acc": 0.6819882415820417, - "acc_stderr": 0.010769343495248544 - }, - "boolq": { - "acc": 0.5972477064220183, - "acc_stderr": 0.008578054401368405 - }, - "arc_easy": { - "acc": 0.5845959595959596, - "acc_stderr": 0.010111869494911517, - "acc_norm": 0.5458754208754208, - "acc_norm_stderr": 0.010216507710244106 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.2721843003412969, - "acc_norm_stderr": 0.013006600406423709 - }, - "sciq": { - "acc": 0.849, - "acc_stderr": 0.011328165223341671, - "acc_norm": 0.82, - "acc_norm_stderr": 0.012155153135511961 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316047, - "acc_norm": 0.7377584330794341, - "acc_norm_stderr": 0.01026250256517244 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_2.csv b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..f795916f8eea02876ef48e0151d702ca89750552 --- /dev/null +++ b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.015050266127564452,0 +anli_r2,acc,0.342,0.015008706182121731,0 +anli_r3,acc,0.3233333333333333,0.01350837286730022,0 +arc_challenge,acc,0.2508532423208191,0.01266819862131543,0 +arc_challenge,acc_norm,0.27303754266211605,0.013019332762635737,0 +arc_easy,acc,0.5984848484848485,0.010058790020755567,0 +arc_easy,acc_norm,0.5723905723905723,0.010151683397430673,0 +boolq,acc,0.591131498470948,0.008598573693259108,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.25420875420875416,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4314877514439355,0.004942716091996085,0 +hellaswag,acc_norm,0.5613423620792671,0.004952087083128898,0 +piqa,acc,0.7279651795429815,0.01038276378624738,0 +piqa,acc_norm,0.7404787812840044,0.010227939888173925,0 +rte,acc,0.49458483754512633,0.030094698123239966,0 +sciq,acc,0.877,0.010391293421849876,0 +sciq,acc_norm,0.851,0.011266140684632178,0 +storycloze_2016,acc,0.6905398182789952,0.010689956745189069,0 +winogrande,acc,0.5864246250986582,0.013840971763195301,0 diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index 06cccb04ab0b450ce4b64506854591ae7f928e99..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.015050266127564452 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.015008706182121731 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.01350837286730022 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.25420875420875416 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4314877514439355, - "acc_stderr": 0.004942716091996085, - "acc_norm": 0.5613423620792671, - "acc_norm_stderr": 0.004952087083128898 - }, - "rte": { - "acc": 0.49458483754512633, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5864246250986582, - "acc_stderr": 0.013840971763195301 - }, - "storycloze_2016": { - "acc": 0.6905398182789952, - "acc_stderr": 0.010689956745189069 - }, - "boolq": { - "acc": 0.591131498470948, - "acc_stderr": 0.008598573693259108 - }, - "arc_easy": { - "acc": 0.5984848484848485, - "acc_stderr": 0.010058790020755567, - "acc_norm": 0.5723905723905723, - "acc_norm_stderr": 0.010151683397430673 - }, - "arc_challenge": { - "acc": 0.2508532423208191, - "acc_stderr": 0.01266819862131543, - "acc_norm": 0.27303754266211605, - "acc_norm_stderr": 0.013019332762635737 - }, - "sciq": { - "acc": 0.877, - "acc_stderr": 0.010391293421849876, - "acc_norm": 0.851, - "acc_norm_stderr": 0.011266140684632178 - }, - "piqa": { - "acc": 0.7279651795429815, - "acc_stderr": 0.01038276378624738, - "acc_norm": 0.7404787812840044, - "acc_norm_stderr": 0.010227939888173925 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_3.csv b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..09e5c2d2f4f71e6da0352af4f9cd3a513201ef53 --- /dev/null +++ b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.316,0.014709193056057121,0 +anli_r2,acc,0.34,0.014987482264363937,0 +anli_r3,acc,0.35583333333333333,0.013826518748493312,0 +arc_challenge,acc,0.2525597269624573,0.012696728980207706,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975586,0 +arc_easy,acc,0.5968013468013468,0.010065668576794801,0 +arc_easy,acc_norm,0.5732323232323232,0.010149141043955635,0 +boolq,acc,0.5914373088685015,0.008597580502718662,1 +cb,acc,0.39285714285714285,0.06585388898066351,1 +cb,f1,0.3316333469332245,,1 +copa,acc,0.76,0.042923469599092816,0 +hellaswag,acc,0.43069109739095796,0.004941609820763587,0 +hellaswag,acc_norm,0.5635331607249552,0.00494933535688186,0 +piqa,acc,0.7426550598476604,0.01019992106479251,0 +piqa,acc_norm,0.749183895538629,0.010113869547069046,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.872,0.010570133761108665,0 +sciq,acc_norm,0.843,0.011510146979230189,0 +storycloze_2016,acc,0.692143238909674,0.01067459815875818,0 +winogrande,acc,0.5453827940015785,0.01399448102706599,0 diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index 9788ca147ec5efefe7a17018677903124233c60f..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.316, - "acc_stderr": 0.014709193056057121 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.013826518748493312 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.06585388898066351, - "f1": 0.3316333469332245 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.042923469599092816 - }, - "hellaswag": { - "acc": 0.43069109739095796, - "acc_stderr": 0.004941609820763587, - "acc_norm": 0.5635331607249552, - "acc_norm_stderr": 0.00494933535688186 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5453827940015785, - "acc_stderr": 0.01399448102706599 - }, - "storycloze_2016": { - "acc": 0.692143238909674, - "acc_stderr": 0.01067459815875818 - }, - "boolq": { - "acc": 0.5914373088685015, - "acc_stderr": 0.008597580502718662 - }, - "arc_easy": { - "acc": 0.5968013468013468, - "acc_stderr": 0.010065668576794801, - "acc_norm": 0.5732323232323232, - "acc_norm_stderr": 0.010149141043955635 - }, - "arc_challenge": { - "acc": 0.2525597269624573, - "acc_stderr": 0.012696728980207706, - "acc_norm": 0.28071672354948807, - "acc_norm_stderr": 0.013131238126975586 - }, - "sciq": { - "acc": 0.872, - "acc_stderr": 0.010570133761108665, - "acc_norm": 0.843, - "acc_norm_stderr": 0.011510146979230189 - }, - "piqa": { - "acc": 0.7426550598476604, - "acc_stderr": 0.01019992106479251, - "acc_norm": 0.749183895538629, - "acc_norm_stderr": 0.010113869547069046 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_4.csv b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..ff3c8b4c16a095473c89d1d54d29846df3a027ef --- /dev/null +++ b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.344,0.015029633724408947,0 +anli_r2,acc,0.339,0.014976758771620344,0 +anli_r3,acc,0.3333333333333333,0.013613950010225603,0 +arc_challenge,acc,0.2440273037542662,0.012551447627856257,0 +arc_challenge,acc_norm,0.2773037542662116,0.013082095839059374,0 +arc_easy,acc,0.5968013468013468,0.010065668576794798,0 +arc_easy,acc_norm,0.5740740740740741,0.010146568651002257,0 +boolq,acc,0.5902140672782875,0.008601532621213529,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.36224233283056817,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4298944433379805,0.004940490508240651,0 +hellaswag,acc_norm,0.5671181039633539,0.004944620712318273,0 +piqa,acc,0.7323177366702938,0.010330111189370429,0 +piqa,acc_norm,0.7366702937976061,0.010276185322196764,0 +rte,acc,0.49097472924187724,0.030091559826331334,0 +sciq,acc,0.879,0.010318210380946092,0 +sciq,acc_norm,0.857,0.011075814808567038,0 +storycloze_2016,acc,0.694815606627472,0.01064866438398566,0 +winogrande,acc,0.574585635359116,0.01389525766664638,0 diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json deleted file mode 100644 index 4b97c13c5c50c2fc020d1a6aba4aba177f0cffb0..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408947 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.014976758771620344 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.013613950010225603 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.36224233283056817 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4298944433379805, - "acc_stderr": 0.004940490508240651, - "acc_norm": 0.5671181039633539, - "acc_norm_stderr": 0.004944620712318273 - }, - "rte": { - "acc": 0.49097472924187724, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.01389525766664638 - }, - "storycloze_2016": { - "acc": 0.694815606627472, - "acc_stderr": 0.01064866438398566 - }, - "boolq": { - "acc": 0.5902140672782875, - "acc_stderr": 0.008601532621213529 - }, - "arc_easy": { - "acc": 0.5968013468013468, - "acc_stderr": 0.010065668576794798, - "acc_norm": 0.5740740740740741, - "acc_norm_stderr": 0.010146568651002257 - }, - "arc_challenge": { - "acc": 0.2440273037542662, - "acc_stderr": 0.012551447627856257, - "acc_norm": 0.2773037542662116, - "acc_norm_stderr": 0.013082095839059374 - }, - "sciq": { - "acc": 0.879, - "acc_stderr": 0.010318210380946092, - "acc_norm": 0.857, - "acc_norm_stderr": 0.011075814808567038 - }, - "piqa": { - "acc": 0.7323177366702938, - "acc_stderr": 0.010330111189370429, - "acc_norm": 0.7366702937976061, - "acc_norm_stderr": 0.010276185322196764 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_5.csv b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..2ee579d64525a9bef67ba9ad5e861cc886d6832e --- /dev/null +++ b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.347,0.015060472031706613,0 +anli_r2,acc,0.336,0.014944140233795025,0 +anli_r3,acc,0.3416666666666667,0.013696658778002514,0 +arc_challenge,acc,0.25597269624573377,0.012753013241244518,0 +arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0 +arc_easy,acc,0.6001683501683501,0.010051788039412918,0 +arc_easy,acc_norm,0.5787037037037037,0.010131882498193126,0 +boolq,acc,0.5865443425076453,0.00861305923994264,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.34917043740573156,,1 +copa,acc,0.77,0.04229525846816505,0 +hellaswag,acc,0.42959569806811393,0.004940067402031044,0 +hellaswag,acc_norm,0.5703047201752639,0.004940208641372079,0 +piqa,acc,0.7399347116430903,0.010234893249061293,0 +piqa,acc_norm,0.7383025027203483,0.010255630772708229,0 +rte,acc,0.51985559566787,0.030072723167317177,0 +sciq,acc,0.88,0.010281328012747394,0 +sciq,acc_norm,0.864,0.01084535023047299,0 +storycloze_2016,acc,0.6835916622127205,0.01075478009794089,0 +winogrande,acc,0.55327545382794,0.013972488371616696,0 diff --git a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index c9f24ae81c665ca2fe375e9a83e542a047d09789..0000000000000000000000000000000000000000 --- a/2b855b18bc4seed4/evaluation/rankeval/2b855b18bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.347, - "acc_stderr": 0.015060472031706613 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795025 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002514 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.34917043740573156 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816505 - }, - "hellaswag": { - "acc": 0.42959569806811393, - "acc_stderr": 0.004940067402031044, - "acc_norm": 0.5703047201752639, - "acc_norm_stderr": 0.004940208641372079 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317177 - }, - "winogrande": { - "acc": 0.55327545382794, - "acc_stderr": 0.013972488371616696 - }, - "storycloze_2016": { - "acc": 0.6835916622127205, - "acc_stderr": 0.01075478009794089 - }, - "boolq": { - "acc": 0.5865443425076453, - "acc_stderr": 0.00861305923994264 - }, - "arc_easy": { - "acc": 0.6001683501683501, - "acc_stderr": 0.010051788039412918, - "acc_norm": 0.5787037037037037, - "acc_norm_stderr": 0.010131882498193126 - }, - "arc_challenge": { - "acc": 0.25597269624573377, - "acc_stderr": 0.012753013241244518, - "acc_norm": 0.2832764505119454, - "acc_norm_stderr": 0.013167478735134575 - }, - "sciq": { - "acc": 0.88, - "acc_stderr": 0.010281328012747394, - "acc_norm": 0.864, - "acc_norm_stderr": 0.01084535023047299 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.010234893249061293, - "acc_norm": 0.7383025027203483, - "acc_norm_stderr": 0.010255630772708229 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/generation/merged.csv b/2b855b1b25c4seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..0f2aa0c05136d150ddd43bb6c450a6fc23ef8645 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.10808551090509165 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.10808551090509165 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1179471641360343 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1179471641360343 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.12473280680597094 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.12473280680597094 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.12982448053301646 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.12982448053301646 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.13043056637914147 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.13043056637914147 +e2e_nlg_cleaned,5,average,multiple,0.10183675479320914 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0339686568913482 +gem_xsum,0,median,rouge2_fmeasure,0.0339686568913482 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.022059968638918123 +gem_xsum,1,median,rouge2_fmeasure,0.022059968638918123 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.020211478626933593 +gem_xsum,2,median,rouge2_fmeasure,0.020211478626933593 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.018477679820395547 +gem_xsum,3,median,rouge2_fmeasure,0.018477679820395547 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.004755338903452315 +gem_xsum,4,median,rouge2_fmeasure,0.004755338903452315 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00018740433266928376 +gem_xsum,5,median,rouge2_fmeasure,0.00018740433266928376 +gem_xsum,5,average,multiple,0.016610087868952843 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03522441143776143 +web_nlg_en,0,median,rouge2_fmeasure,0.03522441143776143 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.02346063453691177 +web_nlg_en,1,median,rouge2_fmeasure,0.02346063453691177 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.02524749774195254 +web_nlg_en,2,median,rouge2_fmeasure,0.02524749774195254 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.024581431529076308 +web_nlg_en,3,median,rouge2_fmeasure,0.024581431529076308 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.026116709769501194 +web_nlg_en,4,median,rouge2_fmeasure,0.026116709769501194 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.0258534295158266 +web_nlg_en,5,median,rouge2_fmeasure,0.0258534295158266 +web_nlg_en,5,average,multiple,0.026747352421838306 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.02534793433650199 +wiki_lingua_en,0,median,rouge2_fmeasure,0.02534793433650199 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.02452577624245024 +wiki_lingua_en,1,median,rouge2_fmeasure,0.02452577624245024 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.026780562948532143 +wiki_lingua_en,2,median,rouge2_fmeasure,0.026780562948532143 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.021853756801402027 +wiki_lingua_en,3,median,rouge2_fmeasure,0.021853756801402027 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.007501348149931797 +wiki_lingua_en,4,median,rouge2_fmeasure,0.007501348149931797 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0010715469734515022 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0010715469734515022 +wiki_lingua_en,5,average,multiple,0.017846820908711616 diff --git a/2b855b1b25c4seed1/evaluation/generation/merged.json b/2b855b1b25c4seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..a69c1b8fdfe86475f22713a03415a543e9efd0f0 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32572585200357784, "bleu_stderr": 0.024884264777712505, "rouge1_fmeasure": 0.08836629283874285, "rouge1_fmeasure_stderr": 0.001933902202882504, "rouge1_precision": 0.06938624687553695, "rouge1_precision_stderr": 0.0022827771046349697, "rouge1_recall": 0.24759254721305518, "rouge1_recall_stderr": 0.004626246644859681, "rouge2_fmeasure": 0.03522441143776143, "rouge2_fmeasure_stderr": 0.0011545622182549142, "rouge2_precision": 0.027738624662658706, "rouge2_precision_stderr": 0.0013441957534332128, "rouge2_recall": 0.10177298488047548, "rouge2_recall_stderr": 0.0028396663546596083, "rougeL_fmeasure": 0.08033963405690978, "rougeL_fmeasure_stderr": 0.001720708314256122, "rougeL_precision": 0.06301753547951804, "rougeL_precision_stderr": 0.002052938942624344, "rougeL_recall": 0.22875144667038966, "rougeL_recall_stderr": 0.004350151227114342, "rougeLsum_fmeasure": 0.08090590477997704, "rougeLsum_fmeasure_stderr": 0.0017626565524760068, "rougeLsum_precision": 0.06405550299013386, "rougeLsum_precision_stderr": 0.0021369146348659894, "rougeLsum_recall": 0.2257505727364471, "rougeLsum_recall_stderr": 0.004155570169881645}}, "1": {"PALM_prompt": {"bleu": 0.2561555153419009, "bleu_stderr": 0.020503391506904913, "rouge1_fmeasure": 0.07157667302309413, "rouge1_fmeasure_stderr": 0.0013947465049788802, "rouge1_precision": 0.04792288448202581, "rouge1_precision_stderr": 0.0014138800360975407, "rouge1_recall": 0.26454448932622804, "rouge1_recall_stderr": 0.004582804713325084, "rouge2_fmeasure": 0.02346063453691177, "rouge2_fmeasure_stderr": 0.0007417178744640911, "rouge2_precision": 0.016018225489946216, "rouge2_precision_stderr": 0.0007550822363380086, "rouge2_recall": 0.09057430348433602, "rouge2_recall_stderr": 0.002711934257795459, "rougeL_fmeasure": 0.0622312622637488, "rougeL_fmeasure_stderr": 0.001169093436012566, "rougeL_precision": 0.041687185550114206, "rougeL_precision_stderr": 0.0012321650233409669, "rougeL_recall": 0.23225066873098027, "rougeL_recall_stderr": 0.003970506422517852, "rougeLsum_fmeasure": 0.06541927653393843, "rougeLsum_fmeasure_stderr": 0.0012656501272991785, "rougeLsum_precision": 0.04384164073704108, "rougeLsum_precision_stderr": 0.0012916241496462786, "rougeLsum_recall": 0.24183075313902289, "rougeLsum_recall_stderr": 0.004151304087758319}}, "2": {"PALM_prompt": {"bleu": 0.29216134763806284, "bleu_stderr": 0.018810216811664723, "rouge1_fmeasure": 0.07546612588436376, "rouge1_fmeasure_stderr": 0.0013350491646123965, "rouge1_precision": 0.048249157638639184, "rouge1_precision_stderr": 0.0012866718245627227, "rouge1_recall": 0.2866727702524528, "rouge1_recall_stderr": 0.004554583398686889, "rouge2_fmeasure": 0.02524749774195254, "rouge2_fmeasure_stderr": 0.0007476111721067034, "rouge2_precision": 0.01642391762691999, "rouge2_precision_stderr": 0.000812195702569356, "rouge2_recall": 0.10260859099926381, "rouge2_recall_stderr": 0.0028390846949135997, "rougeL_fmeasure": 0.06492751409983163, "rougeL_fmeasure_stderr": 0.0011140070204866022, "rougeL_precision": 0.0414398559974983, "rougeL_precision_stderr": 0.0011000916537333813, "rougeL_recall": 0.24881732230934886, "rougeL_recall_stderr": 0.003907427633122226, "rougeLsum_fmeasure": 0.06906850320553264, "rougeLsum_fmeasure_stderr": 0.0012291419473325475, "rougeLsum_precision": 0.0442880148738939, "rougeLsum_precision_stderr": 0.0012235575972288138, "rougeLsum_recall": 0.26237140375920254, "rougeLsum_recall_stderr": 0.004126855560104758}}, "3": {"PALM_prompt": {"bleu": 0.28837217364438783, "bleu_stderr": 0.01734482364386354, "rouge1_fmeasure": 0.075510558557707, "rouge1_fmeasure_stderr": 0.0013460059403738345, "rouge1_precision": 0.04723214570905747, "rouge1_precision_stderr": 0.0010795581743012067, "rouge1_recall": 0.2890465587123036, "rouge1_recall_stderr": 0.004607634538695377, "rouge2_fmeasure": 0.024581431529076308, "rouge2_fmeasure_stderr": 0.0007444727991980332, "rouge2_precision": 0.015279592608775852, "rouge2_precision_stderr": 0.0005777971745783009, "rouge2_recall": 0.10152142246019011, "rouge2_recall_stderr": 0.0029178008276838276, "rougeL_fmeasure": 0.06465280189588929, "rougeL_fmeasure_stderr": 0.0011097313277049047, "rougeL_precision": 0.04032369830512595, "rougeL_precision_stderr": 0.0008792736614100307, "rougeL_recall": 0.25061914296788734, "rougeL_recall_stderr": 0.003972087128050884, "rougeLsum_fmeasure": 0.06908604784621282, "rougeLsum_fmeasure_stderr": 0.0012250308687295043, "rougeLsum_precision": 0.043218907192818706, "rougeLsum_precision_stderr": 0.0009735946333841193, "rougeLsum_recall": 0.2647231787965336, "rougeLsum_recall_stderr": 0.004209755916352629}}, "4": {"PALM_prompt": {"bleu": 0.29183469735894835, "bleu_stderr": 0.01578590267395297, "rouge1_fmeasure": 0.07866380875876984, "rouge1_fmeasure_stderr": 0.001277060999964475, "rouge1_precision": 0.04975694024921801, "rouge1_precision_stderr": 0.00109936382916383, "rouge1_recall": 0.29958965720354436, "rouge1_recall_stderr": 0.004556945156935183, "rouge2_fmeasure": 0.026116709769501194, "rouge2_fmeasure_stderr": 0.0007047251426571129, "rouge2_precision": 0.016269287231396867, "rouge2_precision_stderr": 0.0005187706864040184, "rouge2_recall": 0.10939922703746989, "rouge2_recall_stderr": 0.0029555941363631657, "rougeL_fmeasure": 0.06695675483702884, "rougeL_fmeasure_stderr": 0.0010504584878338998, "rougeL_precision": 0.042321256162322456, "rougeL_precision_stderr": 0.0009383255994510826, "rougeL_recall": 0.2580389112198004, "rougeL_recall_stderr": 0.003883943458877147, "rougeLsum_fmeasure": 0.07215593484915758, "rougeLsum_fmeasure_stderr": 0.0011656444315695205, "rougeLsum_precision": 0.04560923164415852, "rougeLsum_precision_stderr": 0.0010041166570967175, "rougeLsum_recall": 0.2753850874618727, "rougeLsum_recall_stderr": 0.004151771398108518}}, "5": {"PALM_prompt": {"bleu": 0.285980111364132, "bleu_stderr": 0.022376253285834175, "rouge1_fmeasure": 0.07884768542514027, "rouge1_fmeasure_stderr": 0.0013417246790431226, "rouge1_precision": 0.05006289139503477, "rouge1_precision_stderr": 0.0011640922741676001, "rouge1_recall": 0.300199738153936, "rouge1_recall_stderr": 0.004672024566980802, "rouge2_fmeasure": 0.0258534295158266, "rouge2_fmeasure_stderr": 0.000743519185494884, "rouge2_precision": 0.016472172318321005, "rouge2_precision_stderr": 0.0006700891056118128, "rouge2_recall": 0.10856090780207574, "rouge2_recall_stderr": 0.0030158501272052355, "rougeL_fmeasure": 0.06679100411068915, "rougeL_fmeasure_stderr": 0.0011038188698917865, "rougeL_precision": 0.04242592239646745, "rougeL_precision_stderr": 0.0010052603678909523, "rougeL_recall": 0.2569778045396274, "rougeL_recall_stderr": 0.003951367176657829, "rougeLsum_fmeasure": 0.07202797710830955, "rougeLsum_fmeasure_stderr": 0.0012148728769524454, "rougeLsum_precision": 0.04572985638250475, "rougeLsum_precision_stderr": 0.0010703216429484483, "rougeLsum_recall": 0.2762497526776097, "rougeLsum_recall_stderr": 0.004314046428559933}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.9667400684955529, "bleu_stderr": 0.04736004528707971, "rouge1_fmeasure": 0.16021351816275817, "rouge1_fmeasure_stderr": 0.001735611345124232, "rouge1_precision": 0.14354084476430715, "rouge1_precision_stderr": 0.0019344864959865624, "rouge1_recall": 0.22611283271891516, "rouge1_recall_stderr": 0.0024518290050076835, "rouge2_fmeasure": 0.02534793433650199, "rouge2_fmeasure_stderr": 0.0006763243875536915, "rouge2_precision": 0.022996389742916637, "rouge2_precision_stderr": 0.0007173908443070337, "rouge2_recall": 0.0369411082197748, "rouge2_recall_stderr": 0.001126908301758166, "rougeL_fmeasure": 0.12155091165339962, "rougeL_fmeasure_stderr": 0.001225991966151931, "rougeL_precision": 0.10838206537718229, "rougeL_precision_stderr": 0.0014111901886747795, "rougeL_recall": 0.17532455147281636, "rougeL_recall_stderr": 0.0019419678642643332, "rougeLsum_fmeasure": 0.15019542780473918, "rougeLsum_fmeasure_stderr": 0.0016176236122922181, "rougeLsum_precision": 0.13457992261532445, "rougeLsum_precision_stderr": 0.001810756378415834, "rougeLsum_recall": 0.212346997690715, "rougeLsum_recall_stderr": 0.0023036544561547195}}, "1": {"tldr_en": {"bleu": 1.081873532421014, "bleu_stderr": 0.04185591024656912, "rouge1_fmeasure": 0.16035906807458503, "rouge1_fmeasure_stderr": 0.0016387952104774664, "rouge1_precision": 0.1376213661386166, "rouge1_precision_stderr": 0.001705534236336512, "rouge1_recall": 0.23123880715009462, "rouge1_recall_stderr": 0.0022621606951002894, "rouge2_fmeasure": 0.02452577624245024, "rouge2_fmeasure_stderr": 0.000651696493488228, "rouge2_precision": 0.021186784819811674, "rouge2_precision_stderr": 0.0005953235405111804, "rouge2_recall": 0.0357839499690086, "rouge2_recall_stderr": 0.0010431941688998074, "rougeL_fmeasure": 0.11862983745742414, "rougeL_fmeasure_stderr": 0.0011078375827671687, "rougeL_precision": 0.10064460459481238, "rougeL_precision_stderr": 0.0011356725301811719, "rougeL_recall": 0.17570541468253079, "rougeL_recall_stderr": 0.0017649247024355188, "rougeLsum_fmeasure": 0.15086467594141528, "rougeLsum_fmeasure_stderr": 0.0015251006635244222, "rougeLsum_precision": 0.12937529994526567, "rougeLsum_precision_stderr": 0.001591553340490891, "rougeLsum_recall": 0.2180954841469529, "rougeLsum_recall_stderr": 0.0021286676208533304}}, "2": {"tldr_en": {"bleu": 1.1406940508473153, "bleu_stderr": 0.04438516873686383, "rouge1_fmeasure": 0.1646409424544307, "rouge1_fmeasure_stderr": 0.0016603752908741743, "rouge1_precision": 0.14096863425165487, "rouge1_precision_stderr": 0.0017402645333034318, "rouge1_recall": 0.23829037048248744, "rouge1_recall_stderr": 0.00225544872911339, "rouge2_fmeasure": 0.026780562948532143, "rouge2_fmeasure_stderr": 0.0006909155326364937, "rouge2_precision": 0.023159940673783852, "rouge2_precision_stderr": 0.0006190836990761694, "rouge2_recall": 0.03885048155039959, "rouge2_recall_stderr": 0.0011052009671355576, "rougeL_fmeasure": 0.12227448769283658, "rougeL_fmeasure_stderr": 0.0011315257879638573, "rougeL_precision": 0.10339847967305535, "rougeL_precision_stderr": 0.001164109188429874, "rougeL_recall": 0.18202455666829379, "rougeL_recall_stderr": 0.0017659885634231454, "rougeLsum_fmeasure": 0.15495163631438738, "rougeLsum_fmeasure_stderr": 0.0015444341424485212, "rougeLsum_precision": 0.1324428624602557, "rougeLsum_precision_stderr": 0.0016139005093356476, "rougeLsum_recall": 0.22512816235513197, "rougeLsum_recall_stderr": 0.002133968815889736}}, "3": {"tldr_en": {"bleu": 1.2138746815475638, "bleu_stderr": 0.04592549012201426, "rouge1_fmeasure": 0.13877940938140151, "rouge1_fmeasure_stderr": 0.0018666893311103828, "rouge1_precision": 0.12362035168931076, "rouge1_precision_stderr": 0.001984587447206122, "rouge1_recall": 0.19960871133474578, "rouge1_recall_stderr": 0.0026415927194291884, "rouge2_fmeasure": 0.021853756801402027, "rouge2_fmeasure_stderr": 0.0006470834325903754, "rouge2_precision": 0.01930809329422095, "rouge2_precision_stderr": 0.0006077317545873957, "rouge2_recall": 0.03202417050198303, "rouge2_recall_stderr": 0.0010878168197741061, "rougeL_fmeasure": 0.1041409296291964, "rougeL_fmeasure_stderr": 0.0013170854692295673, "rougeL_precision": 0.09230188224011067, "rougeL_precision_stderr": 0.0014285564344012872, "rougeL_recall": 0.15339915425563755, "rougeL_recall_stderr": 0.0020612189061720277, "rougeLsum_fmeasure": 0.13069808692487989, "rougeLsum_fmeasure_stderr": 0.0017452030171091967, "rougeLsum_precision": 0.1164007237397099, "rougeLsum_precision_stderr": 0.001860188243947828, "rougeLsum_recall": 0.18841732070909797, "rougeLsum_recall_stderr": 0.002496578481923037}}, "4": {"tldr_en": {"bleu": 0.2422858180808128, "bleu_stderr": 0.028810204719032197, "rouge1_fmeasure": 0.047039546805527625, "rouge1_fmeasure_stderr": 0.0016053398980841137, "rouge1_precision": 0.04380809344002291, "rouge1_precision_stderr": 0.0016973134943745075, "rouge1_recall": 0.07000716350493573, "rouge1_recall_stderr": 0.002400462652143862, "rouge2_fmeasure": 0.007501348149931797, "rouge2_fmeasure_stderr": 0.0004159633733157211, "rouge2_precision": 0.0069301427139145146, "rouge2_precision_stderr": 0.0004266498914452082, "rouge2_recall": 0.011702414964661427, "rouge2_recall_stderr": 0.0007246985934157146, "rougeL_fmeasure": 0.03565523177459235, "rougeL_fmeasure_stderr": 0.0011880810725397807, "rougeL_precision": 0.03305476880874213, "rougeL_precision_stderr": 0.0012844902889737506, "rougeL_recall": 0.0546280844839468, "rougeL_recall_stderr": 0.0018967540940083882, "rougeLsum_fmeasure": 0.04438718860669264, "rougeLsum_fmeasure_stderr": 0.0015121177727776434, "rougeLsum_precision": 0.04135865721603086, "rougeLsum_precision_stderr": 0.0016072625785908076, "rougeLsum_recall": 0.0662383261663318, "rougeLsum_recall_stderr": 0.0022826638561941606}}, "5": {"tldr_en": {"bleu": 3.493717094687151e-07, "bleu_stderr": 6.691544096134155e-07, "rouge1_fmeasure": 0.007259851950908154, "rouge1_fmeasure_stderr": 0.0006906636902482844, "rouge1_precision": 0.006976454683840875, "rouge1_precision_stderr": 0.0007381838665975385, "rouge1_recall": 0.010945793122418036, "rouge1_recall_stderr": 0.0010487269984637117, "rouge2_fmeasure": 0.0010715469734515022, "rouge2_fmeasure_stderr": 0.00016031532777926015, "rouge2_precision": 0.0010266848511807118, "rouge2_precision_stderr": 0.0001725852708223101, "rouge2_recall": 0.0015549027282555261, "rouge2_recall_stderr": 0.00023759650144642444, "rougeL_fmeasure": 0.005701555681376096, "rougeL_fmeasure_stderr": 0.0005307986664838811, "rougeL_precision": 0.005450689801603827, "rougeL_precision_stderr": 0.0005672687792920024, "rougeL_recall": 0.008723644256083806, "rougeL_recall_stderr": 0.0008322476710870175, "rougeLsum_fmeasure": 0.006816711369037029, "rougeLsum_fmeasure_stderr": 0.0006419087770070392, "rougeLsum_precision": 0.006556513156803884, "rougeLsum_precision_stderr": 0.0006916631225943091, "rougeLsum_recall": 0.010379609187289656, "rougeLsum_recall_stderr": 0.000996631980929296}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}, "1": {"generate_text_restaurant": {"bleu": 4.711665884988567, "bleu_stderr": 0.09266136342700063, "rouge1_fmeasure": 0.3023592112125464, "rouge1_fmeasure_stderr": 0.0020630586612887286, "rouge1_precision": 0.2820010454052785, "rouge1_precision_stderr": 0.0027676833820391104, "rouge1_recall": 0.39333232471005114, "rouge1_recall_stderr": 0.0027340882521585137, "rouge2_fmeasure": 0.10808551090509165, "rouge2_fmeasure_stderr": 0.0013573005637565957, "rouge2_precision": 0.10307229630864055, "rouge2_precision_stderr": 0.0019571868378827654, "rouge2_recall": 0.14259355630191614, "rouge2_recall_stderr": 0.0018127041209698455, "rougeL_fmeasure": 0.21811961093598223, "rougeL_fmeasure_stderr": 0.0014704439369834896, "rougeL_precision": 0.20496306237466308, "rougeL_precision_stderr": 0.0022303078324673554, "rougeL_recall": 0.28682115829558025, "rougeL_recall_stderr": 0.0021412970147450263, "rougeLsum_fmeasure": 0.2518574523269961, "rougeLsum_fmeasure_stderr": 0.0018447448256465858, "rougeLsum_precision": 0.23645943326611307, "rougeLsum_precision_stderr": 0.002521488484888219, "rougeLsum_recall": 0.327552776379214, "rougeLsum_recall_stderr": 0.002454650574201279}}, "2": {"generate_text_restaurant": {"bleu": 4.939155981777199, "bleu_stderr": 0.10397298172344364, "rouge1_fmeasure": 0.308543375337241, "rouge1_fmeasure_stderr": 0.0020169773543979655, "rouge1_precision": 0.27769429731788464, "rouge1_precision_stderr": 0.002564148656275506, "rouge1_recall": 0.4133578653607733, "rouge1_recall_stderr": 0.00268853583482676, "rouge2_fmeasure": 0.1179471641360343, "rouge2_fmeasure_stderr": 0.001357425385904372, "rouge2_precision": 0.10562043268791968, "rouge2_precision_stderr": 0.001565879752727241, "rouge2_recall": 0.16166760215874612, "rouge2_recall_stderr": 0.001902426971424793, "rougeL_fmeasure": 0.2237352196824568, "rougeL_fmeasure_stderr": 0.0014426325417480103, "rougeL_precision": 0.20090264701282012, "rougeL_precision_stderr": 0.0018865600154662852, "rougeL_recall": 0.3040429465945137, "rougeL_recall_stderr": 0.0021831898621806367, "rougeLsum_fmeasure": 0.2576470191668308, "rougeLsum_fmeasure_stderr": 0.001817662559915982, "rougeLsum_precision": 0.2319475194503592, "rougeLsum_precision_stderr": 0.002242447957610098, "rougeLsum_recall": 0.3462153172491861, "rougeLsum_recall_stderr": 0.0024920129459917832}}, "3": {"generate_text_restaurant": {"bleu": 5.2336975526405105, "bleu_stderr": 0.09890742967410301, "rouge1_fmeasure": 0.31716051064091, "rouge1_fmeasure_stderr": 0.0019393647131846835, "rouge1_precision": 0.27936026711491774, "rouge1_precision_stderr": 0.002412113136515013, "rouge1_recall": 0.42856287648845703, "rouge1_recall_stderr": 0.0025616938185847203, "rouge2_fmeasure": 0.12473280680597094, "rouge2_fmeasure_stderr": 0.0013311666433785012, "rouge2_precision": 0.10938140935969894, "rouge2_precision_stderr": 0.001406027255537047, "rouge2_recall": 0.17217769537136587, "rouge2_recall_stderr": 0.001869690547171402, "rougeL_fmeasure": 0.22583582394713633, "rougeL_fmeasure_stderr": 0.0013772117299470112, "rougeL_precision": 0.19827599329668352, "rougeL_precision_stderr": 0.0017186727390283562, "rougeL_recall": 0.30895193212541333, "rougeL_recall_stderr": 0.0020563571014672916, "rougeLsum_fmeasure": 0.26536248446566774, "rougeLsum_fmeasure_stderr": 0.001781481334753249, "rougeLsum_precision": 0.23371465620848672, "rougeLsum_precision_stderr": 0.002111605920776612, "rougeLsum_recall": 0.3591287240709054, "rougeLsum_recall_stderr": 0.0024126486619431274}}, "4": {"generate_text_restaurant": {"bleu": 5.501963533875354, "bleu_stderr": 0.07589153971842322, "rouge1_fmeasure": 0.32403038280036595, "rouge1_fmeasure_stderr": 0.0019198002323294876, "rouge1_precision": 0.2850651198888859, "rouge1_precision_stderr": 0.0023948079942714795, "rouge1_recall": 0.43559108725921547, "rouge1_recall_stderr": 0.00253163770417231, "rouge2_fmeasure": 0.12982448053301646, "rouge2_fmeasure_stderr": 0.0013008288431116738, "rouge2_precision": 0.11367078299710219, "rouge2_precision_stderr": 0.0013822882983325086, "rouge2_recall": 0.17815630100449206, "rouge2_recall_stderr": 0.0018214776974408292, "rougeL_fmeasure": 0.22845301416762678, "rougeL_fmeasure_stderr": 0.0014021608497729226, "rougeL_precision": 0.20057777239700894, "rougeL_precision_stderr": 0.0017241830193517468, "rougeL_recall": 0.3099220930355731, "rougeL_recall_stderr": 0.0020355867358846395, "rougeLsum_fmeasure": 0.2714123161637299, "rougeLsum_fmeasure_stderr": 0.0017678711045128993, "rougeLsum_precision": 0.23871298320915102, "rougeLsum_precision_stderr": 0.0021036813716383074, "rougeLsum_recall": 0.36547462740556785, "rougeLsum_recall_stderr": 0.0023904293692334315}}, "5": {"generate_text_restaurant": {"bleu": 5.4353330979926255, "bleu_stderr": 0.08897011314365992, "rouge1_fmeasure": 0.3253127004033947, "rouge1_fmeasure_stderr": 0.0018822525773453988, "rouge1_precision": 0.28316257604559786, "rouge1_precision_stderr": 0.0023785500997809163, "rouge1_recall": 0.4418122159985122, "rouge1_recall_stderr": 0.0024936835629792947, "rouge2_fmeasure": 0.13043056637914147, "rouge2_fmeasure_stderr": 0.0012830280197393934, "rouge2_precision": 0.11310320228621198, "rouge2_precision_stderr": 0.0013727450084765418, "rouge2_recall": 0.18075712323703044, "rouge2_recall_stderr": 0.0017959173782737853, "rougeL_fmeasure": 0.22961873959887014, "rougeL_fmeasure_stderr": 0.0014017341372490133, "rougeL_precision": 0.1991527037354823, "rougeL_precision_stderr": 0.0017268229946102278, "rougeL_recall": 0.31504197506231296, "rougeL_recall_stderr": 0.0020345363892919485, "rougeLsum_fmeasure": 0.2726733201871582, "rougeLsum_fmeasure_stderr": 0.001758905353201824, "rougeLsum_precision": 0.23742475202749286, "rougeLsum_precision_stderr": 0.002117815534826482, "rougeLsum_recall": 0.3706041908789814, "rougeLsum_recall_stderr": 0.0023531148997161136}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.3067146517578403, "bleu_stderr": 0.06641963209056478, "rouge1_fmeasure": 0.1863790538488017, "rouge1_fmeasure_stderr": 0.002255984739725543, "rouge1_precision": 0.1404003399811309, "rouge1_precision_stderr": 0.002089550126172127, "rouge1_recall": 0.3119095209381341, "rouge1_recall_stderr": 0.003783643983004882, "rouge2_fmeasure": 0.0339686568913482, "rouge2_fmeasure_stderr": 0.0012367046335217662, "rouge2_precision": 0.02537314246064871, "rouge2_precision_stderr": 0.0010456674018307463, "rouge2_recall": 0.05892664067826958, "rouge2_recall_stderr": 0.0021699518904074403, "rougeL_fmeasure": 0.14063634911689182, "rougeL_fmeasure_stderr": 0.0017196992802654802, "rougeL_precision": 0.10650544150928382, "rougeL_precision_stderr": 0.0017091165885559107, "rougeL_recall": 0.23577424920573808, "rougeL_recall_stderr": 0.0029489358603837184, "rougeLsum_fmeasure": 0.1506945934734996, "rougeLsum_fmeasure_stderr": 0.0019013394604372534, "rougeLsum_precision": 0.11359233692180502, "rougeLsum_precision_stderr": 0.0017826241115837874, "rougeLsum_recall": 0.2532578569424056, "rougeLsum_recall_stderr": 0.00328446081573362}}, "1": {"article_DOC_summary": {"bleu": 0.8381237660191255, "bleu_stderr": 0.0927060202856694, "rouge1_fmeasure": 0.1545497910101464, "rouge1_fmeasure_stderr": 0.0020087421392552614, "rouge1_precision": 0.10964906711699166, "rouge1_precision_stderr": 0.0015079176697222103, "rouge1_recall": 0.27268013454350476, "rouge1_recall_stderr": 0.003415548635788548, "rouge2_fmeasure": 0.022059968638918123, "rouge2_fmeasure_stderr": 0.0010112072560759704, "rouge2_precision": 0.015547795047610075, "rouge2_precision_stderr": 0.0007147138027036774, "rouge2_recall": 0.0396196052745503, "rouge2_recall_stderr": 0.001859909411565771, "rougeL_fmeasure": 0.11842169807847193, "rougeL_fmeasure_stderr": 0.0014752005705961232, "rougeL_precision": 0.08384977459410468, "rougeL_precision_stderr": 0.0010974800650410575, "rougeL_recall": 0.21022618727655779, "rougeL_recall_stderr": 0.0026149243332989932, "rougeLsum_fmeasure": 0.1251525369537384, "rougeLsum_fmeasure_stderr": 0.0016532781101759651, "rougeLsum_precision": 0.08861049429735537, "rougeLsum_precision_stderr": 0.0012228373666002105, "rougeLsum_recall": 0.2220524743123222, "rougeLsum_recall_stderr": 0.002927870309103387}}, "2": {"article_DOC_summary": {"bleu": 0.6992590216308486, "bleu_stderr": 0.11467256234969583, "rouge1_fmeasure": 0.15203242673076406, "rouge1_fmeasure_stderr": 0.00198111509266854, "rouge1_precision": 0.10781277909921132, "rouge1_precision_stderr": 0.001488911937277632, "rouge1_recall": 0.26803585021301995, "rouge1_recall_stderr": 0.0032966841261411533, "rouge2_fmeasure": 0.020211478626933593, "rouge2_fmeasure_stderr": 0.0009421419338358723, "rouge2_precision": 0.014236410023715085, "rouge2_precision_stderr": 0.0006700593988760809, "rouge2_recall": 0.0361967764807612, "rouge2_recall_stderr": 0.0016844951039918102, "rougeL_fmeasure": 0.11740431089383181, "rougeL_fmeasure_stderr": 0.001464104007363134, "rougeL_precision": 0.08311189281684, "rougeL_precision_stderr": 0.0010915346979484322, "rougeL_recall": 0.20796145898354557, "rougeL_recall_stderr": 0.002522832932129957, "rougeLsum_fmeasure": 0.12247156690722455, "rougeLsum_fmeasure_stderr": 0.001585731051175008, "rougeLsum_precision": 0.08666813683615872, "rougeLsum_precision_stderr": 0.0011797861476255844, "rougeLsum_recall": 0.2173109750304248, "rougeLsum_recall_stderr": 0.0027511813375711005}}, "3": {"article_DOC_summary": {"bleu": 0.6833532389420252, "bleu_stderr": 0.07367575333198853, "rouge1_fmeasure": 0.14438596890808095, "rouge1_fmeasure_stderr": 0.0020875618465707644, "rouge1_precision": 0.10474622780356196, "rouge1_precision_stderr": 0.0016809385532553902, "rouge1_recall": 0.2503601501638336, "rouge1_recall_stderr": 0.003549781947306463, "rouge2_fmeasure": 0.018477679820395547, "rouge2_fmeasure_stderr": 0.0009002973251009092, "rouge2_precision": 0.013177249807324206, "rouge2_precision_stderr": 0.0006423636578866835, "rouge2_recall": 0.032971750868090996, "rouge2_recall_stderr": 0.001688343488099689, "rougeL_fmeasure": 0.11152645159973636, "rougeL_fmeasure_stderr": 0.0015850389820942076, "rougeL_precision": 0.0808003048083096, "rougeL_precision_stderr": 0.0012676138680810072, "rougeL_recall": 0.19448236037199781, "rougeL_recall_stderr": 0.0028147528789655234, "rougeLsum_fmeasure": 0.11818063015841994, "rougeLsum_fmeasure_stderr": 0.00171784299086193, "rougeLsum_precision": 0.0856691656007358, "rougeLsum_precision_stderr": 0.001381316976035565, "rougeLsum_recall": 0.206013863121834, "rougeLsum_recall_stderr": 0.0030264862999540556}}, "4": {"article_DOC_summary": {"bleu": 0.22586463100887877, "bleu_stderr": 0.04396044693536567, "rouge1_fmeasure": 0.03902928475421105, "rouge1_fmeasure_stderr": 0.0021734852480554566, "rouge1_precision": 0.03313764436243092, "rouge1_precision_stderr": 0.002051212397157048, "rouge1_recall": 0.06192727041930655, "rouge1_recall_stderr": 0.0035372594766604633, "rouge2_fmeasure": 0.004755338903452315, "rouge2_fmeasure_stderr": 0.0005148994284522972, "rouge2_precision": 0.003664939987518973, "rouge2_precision_stderr": 0.0004054448716451556, "rouge2_recall": 0.00817105714210343, "rouge2_recall_stderr": 0.0009506884407170396, "rougeL_fmeasure": 0.03041880188704102, "rougeL_fmeasure_stderr": 0.001676618024306267, "rougeL_precision": 0.02607396732939681, "rougeL_precision_stderr": 0.0016368665556134776, "rougeL_recall": 0.04843995865043651, "rougeL_recall_stderr": 0.002755800206108595, "rougeLsum_fmeasure": 0.032238509954970324, "rougeLsum_fmeasure_stderr": 0.001787106965253106, "rougeLsum_precision": 0.027312091132985057, "rougeLsum_precision_stderr": 0.0016779893035170277, "rougeLsum_recall": 0.05152050400997337, "rougeLsum_recall_stderr": 0.0029522386751175346}}, "5": {"article_DOC_summary": {"bleu": 1.0103963875419344e-38, "bleu_stderr": 2.239218348233867e-33, "rouge1_fmeasure": 0.0020194079841465836, "rouge1_fmeasure_stderr": 0.0005632464860810544, "rouge1_precision": 0.0022525734888460516, "rouge1_precision_stderr": 0.0006339905985573219, "rouge1_recall": 0.001895895954265915, "rouge1_recall_stderr": 0.0005317672705214073, "rouge2_fmeasure": 0.00018740433266928376, "rouge2_fmeasure_stderr": 0.0001102690700883573, "rouge2_precision": 0.00020980376141308438, "rouge2_precision_stderr": 0.00012719836103333043, "rouge2_recall": 0.00017317588072305053, "rouge2_recall_stderr": 0.0001004069765186974, "rougeL_fmeasure": 0.0016059694588278705, "rougeL_fmeasure_stderr": 0.0004385638907412487, "rougeL_precision": 0.0017760664947270475, "rougeL_precision_stderr": 0.0004853651670517887, "rougeL_recall": 0.0015233285795679824, "rougeL_recall_stderr": 0.0004223544796103697, "rougeLsum_fmeasure": 0.0017449039664897972, "rougeLsum_fmeasure_stderr": 0.00048134760367694933, "rougeLsum_precision": 0.0019719620116109395, "rougeLsum_precision_stderr": 0.0005623372051149683, "rougeLsum_recall": 0.001625411259530388, "rougeLsum_recall_stderr": 0.00044236593234674593}}}} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_0.csv b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..e6f99590daa4898b7723b669b373306bfe388f85 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.368,0.015258073561521798,0 +anli_r3,acc,0.3458333333333333,0.013736245342311012,0 +arc_challenge,acc,0.23720136518771331,0.012430399829260835,0 +arc_challenge,acc_norm,0.27303754266211605,0.013019332762635739,0 +arc_easy,acc,0.48442760942760943,0.010254806331961895,0 +arc_easy,acc_norm,0.42592592592592593,0.010146568651002255,0 +boolq,acc,0.4779816513761468,0.008736571521997069,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.34920634920634924,,1 +copa,acc,0.71,0.04560480215720684,0 +hellaswag,acc,0.37044413463453496,0.004819367172685965,0 +hellaswag,acc_norm,0.45309699263095,0.0049677789400119484,0 +piqa,acc,0.7013057671381937,0.01067855639814924,0 +piqa,acc_norm,0.6920565832426551,0.010770892367463671,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.687,0.014671272822977886,0 +sciq,acc_norm,0.622,0.015341165254026649,0 +storycloze_2016,acc,0.6440406199893105,0.011072254184382837,0 +winogrande,acc,0.5193370165745856,0.01404197273371297,0 diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_0_lm-eval_global_step52452_2023-02-11-12-41-27_0shots_backup.json b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_0_lm-eval_global_step52452_2023-02-11-12-41-27_0shots_backup.json deleted file mode 100644 index 8e8518df5b93c66ad0d01fbabbfe74aa5ce36090..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_0_lm-eval_global_step52452_2023-02-11-12-41-27_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.368, - "acc_stderr": 0.015258073561521798 - }, - "anli_r3": { - "acc": 0.3458333333333333, - "acc_stderr": 0.013736245342311012 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.34920634920634924 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.04560480215720684 - }, - "hellaswag": { - "acc": 0.37044413463453496, - "acc_stderr": 0.004819367172685965, - "acc_norm": 0.45309699263095, - "acc_norm_stderr": 0.0049677789400119484 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5193370165745856, - "acc_stderr": 0.01404197273371297 - }, - "storycloze_2016": { - "acc": 0.6440406199893105, - "acc_stderr": 0.011072254184382837 - }, - "boolq": { - "acc": 0.4779816513761468, - "acc_stderr": 0.008736571521997069 - }, - "arc_easy": { - "acc": 0.48442760942760943, - "acc_stderr": 0.010254806331961895, - "acc_norm": 0.42592592592592593, - "acc_norm_stderr": 0.010146568651002255 - }, - "arc_challenge": { - "acc": 0.23720136518771331, - "acc_stderr": 0.012430399829260835, - "acc_norm": 0.27303754266211605, - "acc_norm_stderr": 0.013019332762635739 - }, - "sciq": { - "acc": 0.687, - "acc_stderr": 0.014671272822977886, - "acc_norm": 0.622, - "acc_norm_stderr": 0.015341165254026649 - }, - "piqa": { - "acc": 0.7013057671381937, - "acc_stderr": 0.01067855639814924, - "acc_norm": 0.6920565832426551, - "acc_norm_stderr": 0.010770892367463671 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_1.csv b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..9797c0b4626b170e4c89993a24b770632008b366 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732961,0 +anli_r2,acc,0.382,0.015372453034968519,0 +anli_r3,acc,0.35333333333333333,0.013804572162314932,0 +arc_challenge,acc,0.23720136518771331,0.01243039982926084,0 +arc_challenge,acc_norm,0.27047781569965873,0.012980954547659554,0 +arc_easy,acc,0.4663299663299663,0.010236494647406476,0 +arc_easy,acc_norm,0.43813131313131315,0.010180937100600074,0 +boolq,acc,0.4651376146788991,0.00872377184445813,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3489401351036571,,1 +copa,acc,0.68,0.04688261722621505,0 +hellaswag,acc,0.36805417247560246,0.0048129052790664385,0 +hellaswag,acc_norm,0.4478191595299741,0.004962534264751923,0 +piqa,acc,0.6811751904243744,0.010873037534333418,0 +piqa,acc_norm,0.6751904243743199,0.01092629623829404,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.682,0.014734079309311901,0 +sciq,acc_norm,0.644,0.015149042659306621,0 +storycloze_2016,acc,0.6397648316408338,0.011101519668493525,0 +winogrande,acc,0.5240726124704025,0.014036189665395132,0 diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_1_lm-eval_global_step52452_2023-02-11-12-41-27_1shots_backup.json b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_1_lm-eval_global_step52452_2023-02-11-12-41-27_1shots_backup.json deleted file mode 100644 index 12ff665e79e8315008ac48bcfeb78811329078df..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_1_lm-eval_global_step52452_2023-02-11-12-41-27_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732961 - }, - "anli_r2": { - "acc": 0.382, - "acc_stderr": 0.015372453034968519 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314932 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3489401351036571 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621505 - }, - "hellaswag": { - "acc": 0.36805417247560246, - "acc_stderr": 0.0048129052790664385, - "acc_norm": 0.4478191595299741, - "acc_norm_stderr": 0.004962534264751923 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5240726124704025, - "acc_stderr": 0.014036189665395132 - }, - "storycloze_2016": { - "acc": 0.6397648316408338, - "acc_stderr": 0.011101519668493525 - }, - "boolq": { - "acc": 0.4651376146788991, - "acc_stderr": 0.00872377184445813 - }, - "arc_easy": { - "acc": 0.4663299663299663, - "acc_stderr": 0.010236494647406476, - "acc_norm": 0.43813131313131315, - "acc_norm_stderr": 0.010180937100600074 - }, - "arc_challenge": { - "acc": 0.23720136518771331, - "acc_stderr": 0.01243039982926084, - "acc_norm": 0.27047781569965873, - "acc_norm_stderr": 0.012980954547659554 - }, - "sciq": { - "acc": 0.682, - "acc_stderr": 0.014734079309311901, - "acc_norm": 0.644, - "acc_norm_stderr": 0.015149042659306621 - }, - "piqa": { - "acc": 0.6811751904243744, - "acc_stderr": 0.010873037534333418, - "acc_norm": 0.6751904243743199, - "acc_norm_stderr": 0.01092629623829404 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_2.csv b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..0f70d288c6033318ef7b79075e747c36e65ef917 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.014830507204541045,0 +anli_r2,acc,0.345,0.015039986742055235,0 +anli_r3,acc,0.3416666666666667,0.01369665877800252,0 +arc_challenge,acc,0.2295221843003413,0.012288926760890776,0 +arc_challenge,acc_norm,0.2696245733788396,0.012968040686869148,0 +arc_easy,acc,0.4713804713804714,0.010242962617927188,0 +arc_easy,acc_norm,0.4574915824915825,0.010222638127749486,0 +boolq,acc,0.4584097859327217,0.008714749017709892,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.2744252873563218,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.3701453893646684,0.004818566366066918,0 +hellaswag,acc_norm,0.4473212507468632,0.004962010338226348,0 +piqa,acc,0.6795429815016322,0.010887766073814885,0 +piqa,acc_norm,0.676278563656148,0.010916765010708766,0 +rte,acc,0.4981949458483754,0.030096267148976626,0 +sciq,acc,0.676,0.014806864733738854,0 +sciq,acc_norm,0.67,0.014876872027456741,0 +storycloze_2016,acc,0.6365579903794762,0.01112284144205971,0 +winogrande,acc,0.505130228887135,0.014051745961790516,0 diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json deleted file mode 100644 index 1fbdef910f6976b727cab60712e66465149b7506..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.014830507204541045 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055235 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.01369665877800252 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.2744252873563218 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - }, - "hellaswag": { - "acc": 0.3701453893646684, - "acc_stderr": 0.004818566366066918, - "acc_norm": 0.4473212507468632, - "acc_norm_stderr": 0.004962010338226348 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.505130228887135, - "acc_stderr": 0.014051745961790516 - }, - "storycloze_2016": { - "acc": 0.6365579903794762, - "acc_stderr": 0.01112284144205971 - }, - "boolq": { - "acc": 0.4584097859327217, - "acc_stderr": 0.008714749017709892 - }, - "arc_easy": { - "acc": 0.4713804713804714, - "acc_stderr": 0.010242962617927188, - "acc_norm": 0.4574915824915825, - "acc_norm_stderr": 0.010222638127749486 - }, - "arc_challenge": { - "acc": 0.2295221843003413, - "acc_stderr": 0.012288926760890776, - "acc_norm": 0.2696245733788396, - "acc_norm_stderr": 0.012968040686869148 - }, - "sciq": { - "acc": 0.676, - "acc_stderr": 0.014806864733738854, - "acc_norm": 0.67, - "acc_norm_stderr": 0.014876872027456741 - }, - "piqa": { - "acc": 0.6795429815016322, - "acc_stderr": 0.010887766073814885, - "acc_norm": 0.676278563656148, - "acc_norm_stderr": 0.010916765010708766 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_3.csv b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..7473bc107068e0591420403cecae07c86059a185 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.328,0.014853842487270334,0 +anli_r2,acc,0.364,0.015222868840522022,0 +anli_r3,acc,0.3358333333333333,0.013639261190932879,0 +arc_challenge,acc,0.22781569965870307,0.012256708602326916,0 +arc_challenge,acc_norm,0.2619453924914676,0.012849054826858117,0 +arc_easy,acc,0.4574915824915825,0.010222638127749482,0 +arc_easy,acc_norm,0.4494949494949495,0.010207308833916044,0 +boolq,acc,0.44281345565749236,0.00868766876693082,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.32970225127087877,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.36984664409480184,0.00481776358141023,0 +hellaswag,acc_norm,0.4487153953395738,0.00496346465774724,0 +piqa,acc,0.6866158868335147,0.010822829929195485,0 +piqa,acc_norm,0.6833514689880305,0.010853160531978486,0 +rte,acc,0.48736462093862815,0.030086851767188564,0 +sciq,acc,0.689,0.014645596385722697,0 +sciq,acc_norm,0.675,0.014818724459095526,0 +storycloze_2016,acc,0.6344200962052379,0.01113675894768839,0 +winogrande,acc,0.5224940805051302,0.014038257824059878,0 diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json deleted file mode 100644 index 9058012e306b118e5a8f90d2e2d96ecd269d2f32..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.328, - "acc_stderr": 0.014853842487270334 - }, - "anli_r2": { - "acc": 0.364, - "acc_stderr": 0.015222868840522022 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.013639261190932879 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.32970225127087877 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - }, - "hellaswag": { - "acc": 0.36984664409480184, - "acc_stderr": 0.00481776358141023, - "acc_norm": 0.4487153953395738, - "acc_norm_stderr": 0.00496346465774724 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5224940805051302, - "acc_stderr": 0.014038257824059878 - }, - "storycloze_2016": { - "acc": 0.6344200962052379, - "acc_stderr": 0.01113675894768839 - }, - "boolq": { - "acc": 0.44281345565749236, - "acc_stderr": 0.00868766876693082 - }, - "arc_easy": { - "acc": 0.4574915824915825, - "acc_stderr": 0.010222638127749482, - "acc_norm": 0.4494949494949495, - "acc_norm_stderr": 0.010207308833916044 - }, - "arc_challenge": { - "acc": 0.22781569965870307, - "acc_stderr": 0.012256708602326916, - "acc_norm": 0.2619453924914676, - "acc_norm_stderr": 0.012849054826858117 - }, - "sciq": { - "acc": 0.689, - "acc_stderr": 0.014645596385722697, - "acc_norm": 0.675, - "acc_norm_stderr": 0.014818724459095526 - }, - "piqa": { - "acc": 0.6866158868335147, - "acc_stderr": 0.010822829929195485, - "acc_norm": 0.6833514689880305, - "acc_norm_stderr": 0.010853160531978486 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_4.csv b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..0fc8d5f17d97cb27df28783b1920d556daba4360 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.014888272588203936,0 +anli_r2,acc,0.365,0.015231776226264912,0 +anli_r3,acc,0.32666666666666666,0.013544340907003663,0 +arc_challenge,acc,0.23208191126279865,0.012336718284948853,0 +arc_challenge,acc_norm,0.25597269624573377,0.012753013241244521,0 +arc_easy,acc,0.4675925925925926,0.01023821036880189,0 +arc_easy,acc_norm,0.43897306397306396,0.010183076012972064,0 +boolq,acc,0.4415902140672783,0.008685178933161664,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3114372233869089,,1 +copa,acc,0.62,0.04878317312145632,0 +hellaswag,acc,0.36755626369249156,0.004811543077792728,0 +hellaswag,acc_norm,0.44612626966739694,0.0049607323822552455,0 +piqa,acc,0.691512513601741,0.010776164678037159,0 +piqa,acc_norm,0.6958650707290533,0.010733493335721316,0 +rte,acc,0.47653429602888087,0.03006330041190266,0 +sciq,acc,0.69,0.014632638658632903,0 +sciq,acc_norm,0.678,0.014782913600996659,0 +storycloze_2016,acc,0.623730625334046,0.011202815067213618,0 +winogrande,acc,0.5114443567482242,0.014048804199859329,0 diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_4_lm-eval_global_step52452_2023-02-11-12-41-27_4shots_backup.json b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_4_lm-eval_global_step52452_2023-02-11-12-41-27_4shots_backup.json deleted file mode 100644 index 49163c6363fb857db445146c33c0d145144feb00..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_4_lm-eval_global_step52452_2023-02-11-12-41-27_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203936 - }, - "anli_r2": { - "acc": 0.365, - "acc_stderr": 0.015231776226264912 - }, - "anli_r3": { - "acc": 0.32666666666666666, - "acc_stderr": 0.013544340907003663 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.3114372233869089 - }, - "copa": { - "acc": 0.62, - "acc_stderr": 0.04878317312145632 - }, - "hellaswag": { - "acc": 0.36755626369249156, - "acc_stderr": 0.004811543077792728, - "acc_norm": 0.44612626966739694, - "acc_norm_stderr": 0.0049607323822552455 - }, - "rte": { - "acc": 0.47653429602888087, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5114443567482242, - "acc_stderr": 0.014048804199859329 - }, - "storycloze_2016": { - "acc": 0.623730625334046, - "acc_stderr": 0.011202815067213618 - }, - "boolq": { - "acc": 0.4415902140672783, - "acc_stderr": 0.008685178933161664 - }, - "arc_easy": { - "acc": 0.4675925925925926, - "acc_stderr": 0.01023821036880189, - "acc_norm": 0.43897306397306396, - "acc_norm_stderr": 0.010183076012972064 - }, - "arc_challenge": { - "acc": 0.23208191126279865, - "acc_stderr": 0.012336718284948853, - "acc_norm": 0.25597269624573377, - "acc_norm_stderr": 0.012753013241244521 - }, - "sciq": { - "acc": 0.69, - "acc_stderr": 0.014632638658632903, - "acc_norm": 0.678, - "acc_norm_stderr": 0.014782913600996659 - }, - "piqa": { - "acc": 0.691512513601741, - "acc_stderr": 0.010776164678037159, - "acc_norm": 0.6958650707290533, - "acc_norm_stderr": 0.010733493335721316 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_5.csv b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..39561e9355f772bdec500917cfa4fb29a5e5ae58 --- /dev/null +++ b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229863,0 +anli_r2,acc,0.368,0.015258073561521802,0 +anli_r3,acc,0.34833333333333333,0.01375943749887408,0 +arc_challenge,acc,0.23890784982935154,0.012461071376316614,0 +arc_challenge,acc_norm,0.2721843003412969,0.013006600406423709,0 +arc_easy,acc,0.4663299663299663,0.010236494647406476,0 +arc_easy,acc_norm,0.44991582491582494,0.010208181969301794,0 +boolq,acc,0.44678899082568807,0.008695392261996197,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2605042016806723,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.37293367855008963,0.004825963768772218,0 +hellaswag,acc_norm,0.44911372236606256,0.00496387293685794,0 +piqa,acc,0.6936887921653971,0.010754970032367321,0 +piqa,acc_norm,0.6958650707290533,0.010733493335721316,0 +rte,acc,0.4981949458483754,0.030096267148976626,0 +sciq,acc,0.701,0.014484778521220477,0 +sciq,acc_norm,0.68,0.01475865230357488,0 +storycloze_2016,acc,0.6296098343132015,0.01116720970729424,0 +winogrande,acc,0.5430149960536701,0.01400038676159829,0 diff --git a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_5_lm-eval_global_step52452_2023-02-11-12-41-26_5shots_backup.json b/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_5_lm-eval_global_step52452_2023-02-11-12-41-26_5shots_backup.json deleted file mode 100644 index 8815a8861593499ab751faeee83613024f03d6fc..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed1/evaluation/rankeval/2b855b1b25c4seed1_5_lm-eval_global_step52452_2023-02-11-12-41-26_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229863 - }, - "anli_r2": { - "acc": 0.368, - "acc_stderr": 0.015258073561521802 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.01375943749887408 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2605042016806723 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.37293367855008963, - "acc_stderr": 0.004825963768772218, - "acc_norm": 0.44911372236606256, - "acc_norm_stderr": 0.00496387293685794 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.5430149960536701, - "acc_stderr": 0.01400038676159829 - }, - "storycloze_2016": { - "acc": 0.6296098343132015, - "acc_stderr": 0.01116720970729424 - }, - "boolq": { - "acc": 0.44678899082568807, - "acc_stderr": 0.008695392261996197 - }, - "arc_easy": { - "acc": 0.4663299663299663, - "acc_stderr": 0.010236494647406476, - "acc_norm": 0.44991582491582494, - "acc_norm_stderr": 0.010208181969301794 - }, - "arc_challenge": { - "acc": 0.23890784982935154, - "acc_stderr": 0.012461071376316614, - "acc_norm": 0.2721843003412969, - "acc_norm_stderr": 0.013006600406423709 - }, - "sciq": { - "acc": 0.701, - "acc_stderr": 0.014484778521220477, - "acc_norm": 0.68, - "acc_norm_stderr": 0.01475865230357488 - }, - "piqa": { - "acc": 0.6936887921653971, - "acc_stderr": 0.010754970032367321, - "acc_norm": 0.6958650707290533, - "acc_norm_stderr": 0.010733493335721316 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/generation/merged.csv b/2b855b1b25c4seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..4aafa94995fa798ab5a5aef2aa3a9838cc5e7c8b --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.006146989261393382 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.006146989261393382 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.0931934839990174 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.0931934839990174 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.10811536820309103 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.10811536820309103 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.11497167650749299 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.11497167650749299 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.11851800279247307 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.11851800279247307 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.11692418277126329 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.11692418277126329 +e2e_nlg_cleaned,5,average,multiple,0.09297828392245519 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03158604815509799 +gem_xsum,0,median,rouge2_fmeasure,0.03158604815509799 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.022348403259935444 +gem_xsum,1,median,rouge2_fmeasure,0.022348403259935444 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.01989415677106089 +gem_xsum,2,median,rouge2_fmeasure,0.01989415677106089 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.01877267165553685 +gem_xsum,3,median,rouge2_fmeasure,0.01877267165553685 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0046528406004404926 +gem_xsum,4,median,rouge2_fmeasure,0.0046528406004404926 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00020335003485991417 +gem_xsum,5,median,rouge2_fmeasure,0.00020335003485991417 +gem_xsum,5,average,multiple,0.016242911746155265 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03361291428001285 +web_nlg_en,0,median,rouge2_fmeasure,0.03361291428001285 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.026950252151254912 +web_nlg_en,1,median,rouge2_fmeasure,0.026950252151254912 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.02831144904494951 +web_nlg_en,2,median,rouge2_fmeasure,0.02831144904494951 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.027378655811279583 +web_nlg_en,3,median,rouge2_fmeasure,0.027378655811279583 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.027494435136342994 +web_nlg_en,4,median,rouge2_fmeasure,0.027494435136342994 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.028109209706724696 +web_nlg_en,5,median,rouge2_fmeasure,0.028109209706724696 +web_nlg_en,5,average,multiple,0.02864281935509409 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03471793573296706 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03471793573296706 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03122818349853155 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03122818349853155 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.030125176481577847 +wiki_lingua_en,2,median,rouge2_fmeasure,0.030125176481577847 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.025634833926844586 +wiki_lingua_en,3,median,rouge2_fmeasure,0.025634833926844586 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.008539104174148937 +wiki_lingua_en,4,median,rouge2_fmeasure,0.008539104174148937 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0015648741591109756 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0015648741591109756 +wiki_lingua_en,5,average,multiple,0.021968351328863493 diff --git a/2b855b1b25c4seed2/evaluation/generation/merged.json b/2b855b1b25c4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..29beb96859c2efa794ee20ceace0e4db19d51a36 --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2407893853989093, "bleu_stderr": 0.01588675590631443, "rouge1_fmeasure": 0.08494484098239038, "rouge1_fmeasure_stderr": 0.0017186671495870607, "rouge1_precision": 0.06655956584090508, "rouge1_precision_stderr": 0.0025692284584229413, "rouge1_recall": 0.2701266893978289, "rouge1_recall_stderr": 0.0045880346269875, "rouge2_fmeasure": 0.03361291428001285, "rouge2_fmeasure_stderr": 0.001015988273750564, "rouge2_precision": 0.024817271490369153, "rouge2_precision_stderr": 0.001254168549127894, "rouge2_recall": 0.10844263014333566, "rouge2_recall_stderr": 0.002972720716652751, "rougeL_fmeasure": 0.07718401780693002, "rougeL_fmeasure_stderr": 0.0015157272326543677, "rougeL_precision": 0.06108068878311405, "rougeL_precision_stderr": 0.002464111445153674, "rougeL_recall": 0.24850892342517508, "rougeL_recall_stderr": 0.004279386960691826, "rougeLsum_fmeasure": 0.07817511800881678, "rougeLsum_fmeasure_stderr": 0.001559290363753288, "rougeLsum_precision": 0.061953267333984124, "rougeLsum_precision_stderr": 0.0024881595548451045, "rougeLsum_recall": 0.24863965350663209, "rougeLsum_recall_stderr": 0.0042105114977878545}}, "1": {"PALM_prompt": {"bleu": 0.22522389450429012, "bleu_stderr": 0.022678487191956642, "rouge1_fmeasure": 0.07596188918055159, "rouge1_fmeasure_stderr": 0.0014412398435713432, "rouge1_precision": 0.05012020875791923, "rouge1_precision_stderr": 0.0014036746899904267, "rouge1_recall": 0.26661208341503134, "rouge1_recall_stderr": 0.004516745120322006, "rouge2_fmeasure": 0.026950252151254912, "rouge2_fmeasure_stderr": 0.0008230032859902024, "rouge2_precision": 0.017398516908712573, "rouge2_precision_stderr": 0.0006760027710222252, "rouge2_recall": 0.09816202259453922, "rouge2_recall_stderr": 0.0028397513917673583, "rougeL_fmeasure": 0.06681451175940797, "rougeL_fmeasure_stderr": 0.001247934084398306, "rougeL_precision": 0.04419092741917541, "rougeL_precision_stderr": 0.0013010529117962037, "rougeL_recall": 0.23680781762230116, "rougeL_recall_stderr": 0.004011755119184708, "rougeLsum_fmeasure": 0.0702224328115709, "rougeLsum_fmeasure_stderr": 0.001320629835003517, "rougeLsum_precision": 0.046506920526176006, "rougeLsum_precision_stderr": 0.0013451685592365983, "rougeLsum_recall": 0.24650189035915923, "rougeLsum_recall_stderr": 0.004103162325134287}}, "2": {"PALM_prompt": {"bleu": 0.25574132001032984, "bleu_stderr": 0.01924046688367454, "rouge1_fmeasure": 0.07960237393100172, "rouge1_fmeasure_stderr": 0.0014202343857750695, "rouge1_precision": 0.051544259676055075, "rouge1_precision_stderr": 0.0013361418191430803, "rouge1_recall": 0.2782887384096015, "rouge1_recall_stderr": 0.004454555368692382, "rouge2_fmeasure": 0.02831144904494951, "rouge2_fmeasure_stderr": 0.0008442346182865687, "rouge2_precision": 0.018467622376252272, "rouge2_precision_stderr": 0.0008562727403008964, "rouge2_recall": 0.10392144735947444, "rouge2_recall_stderr": 0.0027833324922700076, "rougeL_fmeasure": 0.06932607315534434, "rougeL_fmeasure_stderr": 0.00123157160941625, "rougeL_precision": 0.044934317995986184, "rougeL_precision_stderr": 0.0012165055928314093, "rougeL_recall": 0.24408713823143852, "rougeL_recall_stderr": 0.003911678761388413, "rougeLsum_fmeasure": 0.07408277798601287, "rougeLsum_fmeasure_stderr": 0.0013241392286996102, "rougeLsum_precision": 0.04797284041540702, "rougeLsum_precision_stderr": 0.0012624053649367935, "rougeLsum_recall": 0.25885442081082094, "rougeLsum_recall_stderr": 0.004144855140650182}}, "3": {"PALM_prompt": {"bleu": 0.260560998951591, "bleu_stderr": 0.017409501356744673, "rouge1_fmeasure": 0.07809353388646345, "rouge1_fmeasure_stderr": 0.0013346021775667018, "rouge1_precision": 0.04898443839175938, "rouge1_precision_stderr": 0.0009932814694225442, "rouge1_recall": 0.27617445046301914, "rouge1_recall_stderr": 0.004458678374109601, "rouge2_fmeasure": 0.027378655811279583, "rouge2_fmeasure_stderr": 0.0007720653662895008, "rouge2_precision": 0.01707963873841892, "rouge2_precision_stderr": 0.0005438610927253239, "rouge2_recall": 0.10274743397701377, "rouge2_recall_stderr": 0.0028055089596719456, "rougeL_fmeasure": 0.06739410679194623, "rougeL_fmeasure_stderr": 0.0011221989826773612, "rougeL_precision": 0.042149384889014256, "rougeL_precision_stderr": 0.0008192118902301488, "rougeL_recall": 0.2405027355796753, "rougeL_recall_stderr": 0.0038484118564200633, "rougeLsum_fmeasure": 0.0722943277611927, "rougeLsum_fmeasure_stderr": 0.0012453246674067316, "rougeLsum_precision": 0.045377242215070196, "rougeLsum_precision_stderr": 0.0009269885115102065, "rougeLsum_recall": 0.2550543972901675, "rougeLsum_recall_stderr": 0.0040672871463552605}}, "4": {"PALM_prompt": {"bleu": 0.2537584394933947, "bleu_stderr": 0.015357690992346443, "rouge1_fmeasure": 0.0804776969361547, "rouge1_fmeasure_stderr": 0.0013896594175415089, "rouge1_precision": 0.05171806175833278, "rouge1_precision_stderr": 0.0013164637308980549, "rouge1_recall": 0.2783059110853954, "rouge1_recall_stderr": 0.004379768669652587, "rouge2_fmeasure": 0.027494435136342994, "rouge2_fmeasure_stderr": 0.000793324673576191, "rouge2_precision": 0.017364188476361977, "rouge2_precision_stderr": 0.0006430608865591611, "rouge2_recall": 0.10167089339806358, "rouge2_recall_stderr": 0.002708406746138074, "rougeL_fmeasure": 0.06896598685996097, "rougeL_fmeasure_stderr": 0.0011581462402353198, "rougeL_precision": 0.044403534855836306, "rougeL_precision_stderr": 0.001179432109775957, "rougeL_recall": 0.24097904325433545, "rougeL_recall_stderr": 0.0037474191337320177, "rougeLsum_fmeasure": 0.07447876809190658, "rougeLsum_fmeasure_stderr": 0.0012696025569026094, "rougeLsum_precision": 0.047964647290641316, "rougeLsum_precision_stderr": 0.00124826823684023, "rougeLsum_recall": 0.2584833170495252, "rougeLsum_recall_stderr": 0.004045243463966518}}, "5": {"PALM_prompt": {"bleu": 0.2632355603765274, "bleu_stderr": 0.018303453610282024, "rouge1_fmeasure": 0.07962368621161962, "rouge1_fmeasure_stderr": 0.0013178607620441577, "rouge1_precision": 0.0500159112126146, "rouge1_precision_stderr": 0.001038509322024226, "rouge1_recall": 0.28574128003741156, "rouge1_recall_stderr": 0.004616644112898593, "rouge2_fmeasure": 0.028109209706724696, "rouge2_fmeasure_stderr": 0.0007418532299792448, "rouge2_precision": 0.01762390928712724, "rouge2_precision_stderr": 0.0005915956013115531, "rouge2_recall": 0.11027241887558425, "rouge2_recall_stderr": 0.0028779282205328356, "rougeL_fmeasure": 0.068209186998192, "rougeL_fmeasure_stderr": 0.0010825456909171211, "rougeL_precision": 0.042787278508444805, "rougeL_precision_stderr": 0.0008569931125629821, "rougeL_recall": 0.24761321949047066, "rougeL_recall_stderr": 0.003946470892602998, "rougeLsum_fmeasure": 0.07375494170325493, "rougeLsum_fmeasure_stderr": 0.0012104240130421113, "rougeLsum_precision": 0.046380453818363254, "rougeLsum_precision_stderr": 0.0009682888591314917, "rougeLsum_recall": 0.265438762514052, "rougeLsum_recall_stderr": 0.004263670073662336}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5471558673059522, "bleu_stderr": 0.06041257257462159, "rouge1_fmeasure": 0.182981417207648, "rouge1_fmeasure_stderr": 0.0018451127320063625, "rouge1_precision": 0.1616451386974564, "rouge1_precision_stderr": 0.0020350918196064384, "rouge1_recall": 0.25721334563076603, "rouge1_recall_stderr": 0.002498466792994716, "rouge2_fmeasure": 0.03471793573296706, "rouge2_fmeasure_stderr": 0.0007971277340144285, "rouge2_precision": 0.030875325619039554, "rouge2_precision_stderr": 0.0007643578843856369, "rouge2_recall": 0.04923386087456456, "rouge2_recall_stderr": 0.0012469699695637354, "rougeL_fmeasure": 0.13638653276318996, "rougeL_fmeasure_stderr": 0.001268168402228597, "rougeL_precision": 0.11915355957677379, "rougeL_precision_stderr": 0.0014004509865827195, "rougeL_recall": 0.19699163362911626, "rougeL_recall_stderr": 0.0019799030205421486, "rougeLsum_fmeasure": 0.17125534703344605, "rougeLsum_fmeasure_stderr": 0.001708813329325487, "rougeLsum_precision": 0.15107917688090688, "rougeLsum_precision_stderr": 0.0018890187686542242, "rougeLsum_recall": 0.24170343372575817, "rougeLsum_recall_stderr": 0.002359582728755557}}, "1": {"tldr_en": {"bleu": 1.4133621259888471, "bleu_stderr": 0.042687423961181435, "rouge1_fmeasure": 0.17586357172129555, "rouge1_fmeasure_stderr": 0.0017726793221739814, "rouge1_precision": 0.1519935670519904, "rouge1_precision_stderr": 0.0018708325280017745, "rouge1_recall": 0.2508067112440759, "rouge1_recall_stderr": 0.002397294505057444, "rouge2_fmeasure": 0.03122818349853155, "rouge2_fmeasure_stderr": 0.000761656027140545, "rouge2_precision": 0.027189297320145984, "rouge2_precision_stderr": 0.0007029269825173658, "rouge2_recall": 0.04471070181824774, "rouge2_recall_stderr": 0.0011941128507048286, "rougeL_fmeasure": 0.12886773044418606, "rougeL_fmeasure_stderr": 0.0011911678276133426, "rougeL_precision": 0.1101238478887212, "rougeL_precision_stderr": 0.0012465708076326942, "rougeL_recall": 0.18876885832750953, "rougeL_recall_stderr": 0.0018539954302858293, "rougeLsum_fmeasure": 0.16521096629383403, "rougeLsum_fmeasure_stderr": 0.001646340029121732, "rougeLsum_precision": 0.1425534902173997, "rougeLsum_precision_stderr": 0.0017374948053666193, "rougeLsum_recall": 0.23644655439048787, "rougeLsum_recall_stderr": 0.002259372578709768}}, "2": {"tldr_en": {"bleu": 1.3477157344527648, "bleu_stderr": 0.05073112878215326, "rouge1_fmeasure": 0.17448122072869857, "rouge1_fmeasure_stderr": 0.0017745053208950148, "rouge1_precision": 0.15147666632434625, "rouge1_precision_stderr": 0.0018958859946232175, "rouge1_recall": 0.24742370074582315, "rouge1_recall_stderr": 0.002384180430192293, "rouge2_fmeasure": 0.030125176481577847, "rouge2_fmeasure_stderr": 0.0007380615549439433, "rouge2_precision": 0.026338248615619973, "rouge2_precision_stderr": 0.0006873843559945932, "rouge2_recall": 0.04318168444900552, "rouge2_recall_stderr": 0.001170944305512945, "rougeL_fmeasure": 0.12813319775413898, "rougeL_fmeasure_stderr": 0.001188860025297456, "rougeL_precision": 0.10978419827783621, "rougeL_precision_stderr": 0.0012479268251534734, "rougeL_recall": 0.18690289594643067, "rougeL_recall_stderr": 0.0018511648329877309, "rougeLsum_fmeasure": 0.16393820027553613, "rougeLsum_fmeasure_stderr": 0.001660700052668571, "rougeLsum_precision": 0.14217854010260553, "rougeLsum_precision_stderr": 0.0017727556412391765, "rougeLsum_recall": 0.23296549496286276, "rougeLsum_recall_stderr": 0.0022508634599894047}}, "3": {"tldr_en": {"bleu": 1.4062950983407798, "bleu_stderr": 0.07472781342508986, "rouge1_fmeasure": 0.14810489931337875, "rouge1_fmeasure_stderr": 0.0019491591145607235, "rouge1_precision": 0.13392581263119946, "rouge1_precision_stderr": 0.002140928944599871, "rouge1_recall": 0.2097792907880317, "rouge1_recall_stderr": 0.002761291880798377, "rouge2_fmeasure": 0.025634833926844586, "rouge2_fmeasure_stderr": 0.0007066867665308688, "rouge2_precision": 0.022971374934634834, "rouge2_precision_stderr": 0.0007047500533679457, "rouge2_recall": 0.03678938166045541, "rouge2_recall_stderr": 0.0011349772721473599, "rougeL_fmeasure": 0.10961401503164327, "rougeL_fmeasure_stderr": 0.0013647661962651585, "rougeL_precision": 0.09850658201165036, "rougeL_precision_stderr": 0.0015659357580403235, "rougeL_recall": 0.1596109287842759, "rougeL_recall_stderr": 0.002148893777925733, "rougeLsum_fmeasure": 0.13871487249420744, "rougeLsum_fmeasure_stderr": 0.001817834857099676, "rougeLsum_precision": 0.12550959702439043, "rougeLsum_precision_stderr": 0.0020164296862764117, "rougeLsum_recall": 0.19677992756511345, "rougeLsum_recall_stderr": 0.0025870041574663997}}, "4": {"tldr_en": {"bleu": 0.32122596628272515, "bleu_stderr": 0.03193872312549078, "rouge1_fmeasure": 0.04793358411626256, "rouge1_fmeasure_stderr": 0.0016577434915373563, "rouge1_precision": 0.043698221541070816, "rouge1_precision_stderr": 0.0016573598268591304, "rouge1_recall": 0.0701742995421549, "rouge1_recall_stderr": 0.002428634588327357, "rouge2_fmeasure": 0.008539104174148937, "rouge2_fmeasure_stderr": 0.0004647990720931085, "rouge2_precision": 0.007826164596892957, "rouge2_precision_stderr": 0.0004736357889751423, "rouge2_recall": 0.0123571530969829, "rouge2_recall_stderr": 0.0007082929687216108, "rougeL_fmeasure": 0.03708693999869154, "rougeL_fmeasure_stderr": 0.0012479702208301237, "rougeL_precision": 0.033718320307617386, "rougeL_precision_stderr": 0.0012678171126893336, "rougeL_recall": 0.05562082855363565, "rougeL_recall_stderr": 0.0019259943013379022, "rougeLsum_fmeasure": 0.045272040205073305, "rougeLsum_fmeasure_stderr": 0.001561249371622721, "rougeLsum_precision": 0.0412679046963365, "rougeLsum_precision_stderr": 0.0015660958093523065, "rougeLsum_recall": 0.06646857033967096, "rougeLsum_recall_stderr": 0.0022974274036754406}}, "5": {"tldr_en": {"bleu": 3.734630973276787e-07, "bleu_stderr": 6.891380964274011e-07, "rouge1_fmeasure": 0.00820355499814886, "rouge1_fmeasure_stderr": 0.000778514691351787, "rouge1_precision": 0.007856946302176288, "rouge1_precision_stderr": 0.0008225817689834667, "rouge1_recall": 0.01207114013415243, "rouge1_recall_stderr": 0.0011465054551796565, "rouge2_fmeasure": 0.0015648741591109756, "rouge2_fmeasure_stderr": 0.00021569980665948175, "rouge2_precision": 0.0015120926353582244, "rouge2_precision_stderr": 0.0002602535681751426, "rouge2_recall": 0.0022671439397422507, "rouge2_recall_stderr": 0.0003110901669574075, "rougeL_fmeasure": 0.00616295671829818, "rougeL_fmeasure_stderr": 0.0005709775100680275, "rougeL_precision": 0.005886048362481323, "rougeL_precision_stderr": 0.0006067939277130283, "rougeL_recall": 0.009251431512766469, "rougeL_recall_stderr": 0.0008728854881788707, "rougeLsum_fmeasure": 0.007706899557082421, "rougeLsum_fmeasure_stderr": 0.0007302491686647032, "rougeLsum_precision": 0.007288999163071736, "rougeLsum_precision_stderr": 0.0007514803919271913, "rougeLsum_recall": 0.011443995306376289, "rougeLsum_recall_stderr": 0.0010913578260592406}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.33057623832858146, "bleu_stderr": 0.03984643570206641, "rouge1_fmeasure": 0.09646457743499849, "rouge1_fmeasure_stderr": 0.00129482016069277, "rouge1_precision": 0.09361107563722128, "rouge1_precision_stderr": 0.001407847305983867, "rouge1_recall": 0.118871956787488, "rouge1_recall_stderr": 0.0017922313238890985, "rouge2_fmeasure": 0.006146989261393382, "rouge2_fmeasure_stderr": 0.00034538440130717945, "rouge2_precision": 0.005525861165016361, "rouge2_precision_stderr": 0.00034328183329122446, "rouge2_recall": 0.008459590079107696, "rouge2_recall_stderr": 0.0004901694379117778, "rougeL_fmeasure": 0.07833870274147303, "rougeL_fmeasure_stderr": 0.000977745299752495, "rougeL_precision": 0.07684665378040627, "rougeL_precision_stderr": 0.001144151928375766, "rougeL_recall": 0.09646289172478788, "rougeL_recall_stderr": 0.001393435978704278, "rougeLsum_fmeasure": 0.08574260456706272, "rougeLsum_fmeasure_stderr": 0.0011232552597538695, "rougeLsum_precision": 0.08393090912824838, "rougeLsum_precision_stderr": 0.0012678818997769541, "rougeLsum_recall": 0.10497927092248187, "rougeLsum_recall_stderr": 0.0015417063112072164}}, "1": {"generate_text_restaurant": {"bleu": 4.530972728578894, "bleu_stderr": 0.0696696215830592, "rouge1_fmeasure": 0.2680705216946067, "rouge1_fmeasure_stderr": 0.002169718312369923, "rouge1_precision": 0.26342677667409126, "rouge1_precision_stderr": 0.0027722778645579973, "rouge1_recall": 0.36668209594324785, "rouge1_recall_stderr": 0.0032236568683673083, "rouge2_fmeasure": 0.0931934839990174, "rouge2_fmeasure_stderr": 0.0013022057379119408, "rouge2_precision": 0.09076183803995215, "rouge2_precision_stderr": 0.0019406771010987208, "rouge2_recall": 0.13093605381336618, "rouge2_recall_stderr": 0.0018781901035621468, "rougeL_fmeasure": 0.19827432994801306, "rougeL_fmeasure_stderr": 0.0015203602465609666, "rougeL_precision": 0.20287484788272048, "rougeL_precision_stderr": 0.002489617725388583, "rougeL_recall": 0.2720670406083809, "rougeL_recall_stderr": 0.0024480786650828788, "rougeLsum_fmeasure": 0.2264004289052051, "rougeLsum_fmeasure_stderr": 0.0019007688550298912, "rougeLsum_precision": 0.2276790296025434, "rougeLsum_precision_stderr": 0.0026503966610546413, "rougeLsum_recall": 0.30878667679727767, "rougeLsum_recall_stderr": 0.002847435111528069}}, "2": {"generate_text_restaurant": {"bleu": 5.145577986524896, "bleu_stderr": 0.10889471407623343, "rouge1_fmeasure": 0.28244486105155053, "rouge1_fmeasure_stderr": 0.002044191586485003, "rouge1_precision": 0.25622125083481745, "rouge1_precision_stderr": 0.002465641361783546, "rouge1_recall": 0.39474743607983587, "rouge1_recall_stderr": 0.002930791198458831, "rouge2_fmeasure": 0.10811536820309103, "rouge2_fmeasure_stderr": 0.0012813392182682906, "rouge2_precision": 0.09469321139330758, "rouge2_precision_stderr": 0.0014641639934404919, "rouge2_recall": 0.15515662167719996, "rouge2_recall_stderr": 0.001877110800933785, "rougeL_fmeasure": 0.20954491982888515, "rougeL_fmeasure_stderr": 0.0014552615577714736, "rougeL_precision": 0.19389989577226457, "rougeL_precision_stderr": 0.0020671723239624284, "rougeL_recall": 0.29505332724032807, "rougeL_recall_stderr": 0.0023004162214782173, "rougeLsum_fmeasure": 0.23983439495330813, "rougeLsum_fmeasure_stderr": 0.0018038088455233146, "rougeLsum_precision": 0.22006636679032754, "rougeLsum_precision_stderr": 0.002277981416211695, "rougeLsum_recall": 0.3353592051253118, "rougeLsum_recall_stderr": 0.002651592118498445}}, "3": {"generate_text_restaurant": {"bleu": 5.351176959578303, "bleu_stderr": 0.08807326329086133, "rouge1_fmeasure": 0.2893170367492987, "rouge1_fmeasure_stderr": 0.0019292923674252467, "rouge1_precision": 0.2509843525625443, "rouge1_precision_stderr": 0.0022795153105266796, "rouge1_recall": 0.41075769919962274, "rouge1_recall_stderr": 0.0027215172691620946, "rouge2_fmeasure": 0.11497167650749299, "rouge2_fmeasure_stderr": 0.0012473504111135097, "rouge2_precision": 0.09808500774089142, "rouge2_precision_stderr": 0.001373720697120782, "rouge2_recall": 0.16686582229173133, "rouge2_recall_stderr": 0.0018178589205108053, "rougeL_fmeasure": 0.21358673436927297, "rougeL_fmeasure_stderr": 0.001413456063914363, "rougeL_precision": 0.18662529779347997, "rougeL_precision_stderr": 0.0017938765662735545, "rougeL_recall": 0.305779323504594, "rougeL_recall_stderr": 0.002189452828807476, "rougeLsum_fmeasure": 0.2466444374222866, "rougeLsum_fmeasure_stderr": 0.0017509965973494443, "rougeLsum_precision": 0.21470439372529787, "rougeLsum_precision_stderr": 0.0020492866729247545, "rougeLsum_recall": 0.3506926104456191, "rougeLsum_recall_stderr": 0.002532701405656143}}, "4": {"generate_text_restaurant": {"bleu": 5.538049705153007, "bleu_stderr": 0.08875181059383651, "rouge1_fmeasure": 0.2937308856610805, "rouge1_fmeasure_stderr": 0.0018562529585398333, "rouge1_precision": 0.25061368438882914, "rouge1_precision_stderr": 0.002176339551089046, "rouge1_recall": 0.4167854507341472, "rouge1_recall_stderr": 0.0026117920434684887, "rouge2_fmeasure": 0.11851800279247307, "rouge2_fmeasure_stderr": 0.0012274464082428587, "rouge2_precision": 0.09927467532176347, "rouge2_precision_stderr": 0.0013166755796873335, "rouge2_recall": 0.17178023783886964, "rouge2_recall_stderr": 0.001784972342614204, "rougeL_fmeasure": 0.21682026198985094, "rougeL_fmeasure_stderr": 0.0014038416501564435, "rougeL_precision": 0.18600435503892587, "rougeL_precision_stderr": 0.001740274021720506, "rougeL_recall": 0.3097896489834742, "rougeL_recall_stderr": 0.0021285131025981004, "rougeLsum_fmeasure": 0.25024672301095463, "rougeLsum_fmeasure_stderr": 0.0017074046381346346, "rougeLsum_precision": 0.2141173436775283, "rougeLsum_precision_stderr": 0.001970588251595052, "rougeLsum_recall": 0.3552224568252288, "rougeLsum_recall_stderr": 0.0024266755082468245}}, "5": {"generate_text_restaurant": {"bleu": 5.413211198654327, "bleu_stderr": 0.11926917918560406, "rouge1_fmeasure": 0.29170879524664495, "rouge1_fmeasure_stderr": 0.0017935485556795942, "rouge1_precision": 0.24308016477845365, "rouge1_precision_stderr": 0.002025495745571956, "rouge1_recall": 0.41988865569638695, "rouge1_recall_stderr": 0.0025300858102015942, "rouge2_fmeasure": 0.11692418277126329, "rouge2_fmeasure_stderr": 0.0011909498622458903, "rouge2_precision": 0.09635018984873049, "rouge2_precision_stderr": 0.0011994379502815046, "rouge2_recall": 0.17132121420569668, "rouge2_recall_stderr": 0.0017331484110346491, "rougeL_fmeasure": 0.2138612178730655, "rougeL_fmeasure_stderr": 0.0013838503410661576, "rougeL_precision": 0.1785481956428259, "rougeL_precision_stderr": 0.0015856362953524012, "rougeL_recall": 0.3100622585798036, "rougeL_recall_stderr": 0.0020920393648656602, "rougeLsum_fmeasure": 0.24941629556984113, "rougeLsum_fmeasure_stderr": 0.0016571992189944708, "rougeLsum_precision": 0.20806711230127614, "rougeLsum_precision_stderr": 0.0018292236129827954, "rougeLsum_recall": 0.35942058024604345, "rougeLsum_recall_stderr": 0.0023737768837312633}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0600366791958193, "bleu_stderr": 0.08600237771791341, "rouge1_fmeasure": 0.18195886211014126, "rouge1_fmeasure_stderr": 0.0022402158625493564, "rouge1_precision": 0.13435472927147493, "rouge1_precision_stderr": 0.0018578288644047064, "rouge1_recall": 0.3064849498297693, "rouge1_recall_stderr": 0.003795252172659445, "rouge2_fmeasure": 0.03158604815509799, "rouge2_fmeasure_stderr": 0.0011464678637717971, "rouge2_precision": 0.02282389146066939, "rouge2_precision_stderr": 0.000848692744901622, "rouge2_recall": 0.05494938986520129, "rouge2_recall_stderr": 0.0020307872153233868, "rougeL_fmeasure": 0.13550808265300648, "rougeL_fmeasure_stderr": 0.0016201019441011823, "rougeL_precision": 0.09988815515390925, "rougeL_precision_stderr": 0.0013428280773152147, "rougeL_recall": 0.22929947251941216, "rougeL_recall_stderr": 0.002815195171249488, "rougeLsum_fmeasure": 0.14292063646408693, "rougeLsum_fmeasure_stderr": 0.0018164038774492516, "rougeLsum_precision": 0.10509976393025357, "rougeLsum_precision_stderr": 0.0014598852978289902, "rougeLsum_recall": 0.2425540245465752, "rougeLsum_recall_stderr": 0.00322225885216011}}, "1": {"article_DOC_summary": {"bleu": 0.7912865307600689, "bleu_stderr": 0.08293001880734589, "rouge1_fmeasure": 0.15582763434133562, "rouge1_fmeasure_stderr": 0.002018414756290501, "rouge1_precision": 0.11064657354189784, "rouge1_precision_stderr": 0.00151842139941034, "rouge1_recall": 0.2743434473621011, "rouge1_recall_stderr": 0.003357809294550975, "rouge2_fmeasure": 0.022348403259935444, "rouge2_fmeasure_stderr": 0.001007611674053319, "rouge2_precision": 0.015762873264525114, "rouge2_precision_stderr": 0.0007168463121213941, "rouge2_recall": 0.04002056277766429, "rouge2_recall_stderr": 0.0018098696341123717, "rougeL_fmeasure": 0.11882335449816735, "rougeL_fmeasure_stderr": 0.0014958515133636225, "rougeL_precision": 0.08418214038966489, "rougeL_precision_stderr": 0.0011141701968789468, "rougeL_recall": 0.2106611900789871, "rougeL_recall_stderr": 0.002607033509434174, "rougeLsum_fmeasure": 0.12755589519657246, "rougeLsum_fmeasure_stderr": 0.0016643755278025376, "rougeLsum_precision": 0.09046238615369645, "rougeLsum_precision_stderr": 0.0012434437678640776, "rougeLsum_recall": 0.2254795341945721, "rougeLsum_recall_stderr": 0.00284539636619007}}, "2": {"article_DOC_summary": {"bleu": 0.7304297768472815, "bleu_stderr": 0.06751105754043198, "rouge1_fmeasure": 0.14869343126177825, "rouge1_fmeasure_stderr": 0.00189722263525614, "rouge1_precision": 0.10529230902317702, "rouge1_precision_stderr": 0.0014213563350607538, "rouge1_recall": 0.26340923995527166, "rouge1_recall_stderr": 0.0031984184553666478, "rouge2_fmeasure": 0.01989415677106089, "rouge2_fmeasure_stderr": 0.0009138512512834885, "rouge2_precision": 0.013964349604668382, "rouge2_precision_stderr": 0.0006436254907378268, "rouge2_recall": 0.036126383134734597, "rouge2_recall_stderr": 0.0017090930398331523, "rougeL_fmeasure": 0.1154144287891115, "rougeL_fmeasure_stderr": 0.0014563126208745625, "rougeL_precision": 0.08156379215031906, "rougeL_precision_stderr": 0.0010781341247311795, "rougeL_recall": 0.20577593365587615, "rougeL_recall_stderr": 0.0025743104887950294, "rougeLsum_fmeasure": 0.12153146911034832, "rougeLsum_fmeasure_stderr": 0.0015573728905659264, "rougeLsum_precision": 0.08590505180979985, "rougeLsum_precision_stderr": 0.001156796616272971, "rougeLsum_recall": 0.21659012592548507, "rougeLsum_recall_stderr": 0.002722603241756008}}, "3": {"article_DOC_summary": {"bleu": 0.7329018112338058, "bleu_stderr": 0.08380192291775194, "rouge1_fmeasure": 0.14275992296440837, "rouge1_fmeasure_stderr": 0.002051195848546264, "rouge1_precision": 0.10347805202602275, "rouge1_precision_stderr": 0.0016249970300677694, "rouge1_recall": 0.2478957802804731, "rouge1_recall_stderr": 0.0035092046078961637, "rouge2_fmeasure": 0.01877267165553685, "rouge2_fmeasure_stderr": 0.0009292434072393322, "rouge2_precision": 0.013389288853013175, "rouge2_precision_stderr": 0.000674911659994626, "rouge2_recall": 0.0338213807917493, "rouge2_recall_stderr": 0.0017212461586859492, "rougeL_fmeasure": 0.11012753487421016, "rougeL_fmeasure_stderr": 0.0015088757359987108, "rougeL_precision": 0.07974633787988096, "rougeL_precision_stderr": 0.0012037620202931326, "rougeL_recall": 0.1923594052170303, "rougeL_recall_stderr": 0.00270302170810256, "rougeLsum_fmeasure": 0.11706693412832299, "rougeLsum_fmeasure_stderr": 0.0016978755783353637, "rougeLsum_precision": 0.08476633602783612, "rougeLsum_precision_stderr": 0.0013528627031471775, "rougeLsum_recall": 0.20422079804996465, "rougeLsum_recall_stderr": 0.0029866505943345317}}, "4": {"article_DOC_summary": {"bleu": 0.3635455323887453, "bleu_stderr": 0.06961392400107996, "rouge1_fmeasure": 0.03914885118279171, "rouge1_fmeasure_stderr": 0.0021518975935630276, "rouge1_precision": 0.0336357304836406, "rouge1_precision_stderr": 0.0021254922215637635, "rouge1_recall": 0.06247564004887155, "rouge1_recall_stderr": 0.003526042418026675, "rouge2_fmeasure": 0.0046528406004404926, "rouge2_fmeasure_stderr": 0.0005326361454877763, "rouge2_precision": 0.0035242575838239053, "rouge2_precision_stderr": 0.00042797324328468125, "rouge2_recall": 0.008157735206889534, "rouge2_recall_stderr": 0.0009536882494756777, "rougeL_fmeasure": 0.030138883372020763, "rougeL_fmeasure_stderr": 0.001635026912796053, "rougeL_precision": 0.026581235225849098, "rougeL_precision_stderr": 0.0017998368418240566, "rougeL_recall": 0.048243865226650404, "rougeL_recall_stderr": 0.0027156158644718745, "rougeLsum_fmeasure": 0.032257299613414364, "rougeLsum_fmeasure_stderr": 0.0017540240953433808, "rougeLsum_precision": 0.02809742134075658, "rougeLsum_precision_stderr": 0.0018521710838069934, "rougeLsum_recall": 0.05191151500212486, "rougeLsum_recall_stderr": 0.002943341388673509}}, "5": {"article_DOC_summary": {"bleu": 5.511259666473159e-41, "bleu_stderr": 5.35057924777955e-35, "rouge1_fmeasure": 0.002355096865323314, "rouge1_fmeasure_stderr": 0.0006477286811702701, "rouge1_precision": 0.0026592811148713944, "rouge1_precision_stderr": 0.0007347613362864036, "rouge1_recall": 0.0021940356252624156, "rouge1_recall_stderr": 0.0006082404524842467, "rouge2_fmeasure": 0.00020335003485991417, "rouge2_fmeasure_stderr": 0.00010225485811445311, "rouge2_precision": 0.00022605954903402186, "rouge2_precision_stderr": 0.00011315061618655478, "rouge2_recall": 0.00018624087713987823, "rouge2_recall_stderr": 9.438134982347295e-05, "rougeL_fmeasure": 0.0016413300314297432, "rougeL_fmeasure_stderr": 0.0004516290431859227, "rougeL_precision": 0.0018699850768750823, "rougeL_precision_stderr": 0.0005179448567454989, "rougeL_recall": 0.0015107074139989849, "rougeL_recall_stderr": 0.00041579744590096776, "rougeLsum_fmeasure": 0.0017451369586540383, "rougeLsum_fmeasure_stderr": 0.0004913373774244189, "rougeLsum_precision": 0.0019867184483254347, "rougeLsum_precision_stderr": 0.0005620765129149496, "rougeLsum_recall": 0.0016072239784726086, "rougeLsum_recall_stderr": 0.0004531432979747701}}}} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_0.csv b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..83dffb65a23c525b39843784aafd752e0e457fb6 --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.316,0.01470919305605713,0 +anli_r2,acc,0.349,0.015080663991563097,0 +anli_r3,acc,0.36083333333333334,0.013869180252444862,0 +arc_challenge,acc,0.2226962457337884,0.012158314774829917,0 +arc_challenge,acc_norm,0.25597269624573377,0.01275301324124452,0 +arc_easy,acc,0.4633838383838384,0.01023223506393303,0 +arc_easy,acc_norm,0.4217171717171717,0.010133255284012318,0 +boolq,acc,0.4972477064220184,0.008744922485713845,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.4304347826086956,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.37870942043417644,0.004840742206718102,0 +hellaswag,acc_norm,0.4556861183031269,0.004970145708188002,0 +piqa,acc,0.6860718171926007,0.010827928134189643,0 +piqa,acc_norm,0.6985854189336235,0.01070624824275376,0 +rte,acc,0.5595667870036101,0.029882123363118723,0 +sciq,acc,0.674,0.014830507204541037,0 +sciq,acc_norm,0.623,0.01533317012577986,0 +storycloze_2016,acc,0.6461785141635489,0.011057260832171067,0 +winogrande,acc,0.5090765588003157,0.014050170094497704,0 diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_0_lm-eval_global_step52452_2023-02-11-12-41-26_0shots_backup.json b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_0_lm-eval_global_step52452_2023-02-11-12-41-26_0shots_backup.json deleted file mode 100644 index f1fc5cce848374c3533980de5d7dd45e67b65876..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_0_lm-eval_global_step52452_2023-02-11-12-41-26_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.316, - "acc_stderr": 0.01470919305605713 - }, - "anli_r2": { - "acc": 0.349, - "acc_stderr": 0.015080663991563097 - }, - "anli_r3": { - "acc": 0.36083333333333334, - "acc_stderr": 0.013869180252444862 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.4304347826086956 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.37870942043417644, - "acc_stderr": 0.004840742206718102, - "acc_norm": 0.4556861183031269, - "acc_norm_stderr": 0.004970145708188002 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118723 - }, - "winogrande": { - "acc": 0.5090765588003157, - "acc_stderr": 0.014050170094497704 - }, - "storycloze_2016": { - "acc": 0.6461785141635489, - "acc_stderr": 0.011057260832171067 - }, - "boolq": { - "acc": 0.4972477064220184, - "acc_stderr": 0.008744922485713845 - }, - "arc_easy": { - "acc": 0.4633838383838384, - "acc_stderr": 0.01023223506393303, - "acc_norm": 0.4217171717171717, - "acc_norm_stderr": 0.010133255284012318 - }, - "arc_challenge": { - "acc": 0.2226962457337884, - "acc_stderr": 0.012158314774829917, - "acc_norm": 0.25597269624573377, - "acc_norm_stderr": 0.01275301324124452 - }, - "sciq": { - "acc": 0.674, - "acc_stderr": 0.014830507204541037, - "acc_norm": 0.623, - "acc_norm_stderr": 0.01533317012577986 - }, - "piqa": { - "acc": 0.6860718171926007, - "acc_stderr": 0.010827928134189643, - "acc_norm": 0.6985854189336235, - "acc_norm_stderr": 0.01070624824275376 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_1.csv b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..bf31c492e8f4b0a3e269e939c1ec8ac6ceb718c2 --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.32,0.01475865230357488,0 +anli_r2,acc,0.334,0.01492201952373296,0 +anli_r3,acc,0.3425,0.013704669762934725,0 +arc_challenge,acc,0.21160409556313994,0.011935916358632859,0 +arc_challenge,acc_norm,0.24658703071672355,0.012595726268790132,0 +arc_easy,acc,0.4722222222222222,0.010243938285881115,0 +arc_easy,acc_norm,0.44234006734006737,0.01019133444422086,0 +boolq,acc,0.5045871559633027,0.008744686941762905,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.28801528801528803,,1 +copa,acc,0.64,0.048241815132442176,0 +hellaswag,acc,0.377912766381199,0.004838747305783333,0 +hellaswag,acc_norm,0.4538936466839275,0.0049685216080654635,0 +piqa,acc,0.6855277475516867,0.010833009065106572,0 +piqa,acc_norm,0.690424374319913,0.010786656752183345,0 +rte,acc,0.5667870036101083,0.029826764082138274,0 +sciq,acc,0.68,0.014758652303574885,0 +sciq,acc_norm,0.682,0.0147340793093119,0 +storycloze_2016,acc,0.638161411010155,0.011112247531047543,0 +winogrande,acc,0.5090765588003157,0.014050170094497712,0 diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json deleted file mode 100644 index 0f13106e0682ba3ee03e6aeb0d806a909533d619..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.32, - "acc_stderr": 0.01475865230357488 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.01492201952373296 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934725 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.28801528801528803 - }, - "copa": { - "acc": 0.64, - "acc_stderr": 0.048241815132442176 - }, - "hellaswag": { - "acc": 0.377912766381199, - "acc_stderr": 0.004838747305783333, - "acc_norm": 0.4538936466839275, - "acc_norm_stderr": 0.0049685216080654635 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.029826764082138274 - }, - "winogrande": { - "acc": 0.5090765588003157, - "acc_stderr": 0.014050170094497712 - }, - "storycloze_2016": { - "acc": 0.638161411010155, - "acc_stderr": 0.011112247531047543 - }, - "boolq": { - "acc": 0.5045871559633027, - "acc_stderr": 0.008744686941762905 - }, - "arc_easy": { - "acc": 0.4722222222222222, - "acc_stderr": 0.010243938285881115, - "acc_norm": 0.44234006734006737, - "acc_norm_stderr": 0.01019133444422086 - }, - "arc_challenge": { - "acc": 0.21160409556313994, - "acc_stderr": 0.011935916358632859, - "acc_norm": 0.24658703071672355, - "acc_norm_stderr": 0.012595726268790132 - }, - "sciq": { - "acc": 0.68, - "acc_stderr": 0.014758652303574885, - "acc_norm": 0.682, - "acc_norm_stderr": 0.0147340793093119 - }, - "piqa": { - "acc": 0.6855277475516867, - "acc_stderr": 0.010833009065106572, - "acc_norm": 0.690424374319913, - "acc_norm_stderr": 0.010786656752183345 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_2.csv b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..93164de0d80e3b6b05946488ae32630f8a2c2794 --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.01488827258820393,0 +anli_r2,acc,0.346,0.015050266127564434,0 +anli_r3,acc,0.35,0.013774667009018554,0 +arc_challenge,acc,0.2167235494880546,0.012040156713481192,0 +arc_challenge,acc_norm,0.2593856655290102,0.012808273573927097,0 +arc_easy,acc,0.4739057239057239,0.010245801990240047,0 +arc_easy,acc_norm,0.46085858585858586,0.01022829820076612,0 +boolq,acc,0.5146788990825688,0.00874128556866792,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.28381437464629317,,1 +copa,acc,0.64,0.04824181513244218,0 +hellaswag,acc,0.37492531368253335,0.004831142570475513,0 +hellaswag,acc_norm,0.44851623182632944,0.004963259311700551,0 +piqa,acc,0.6844396082698585,0.01084311920175894,0 +piqa,acc_norm,0.6871599564744287,0.010817714425701088,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.688,0.01465847437050901,0 +sciq,acc_norm,0.679,0.014770821817934637,0 +storycloze_2016,acc,0.6419027258150721,0.01108700680992571,0 +winogrande,acc,0.5122336227308603,0.01404827882040562,0 diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json deleted file mode 100644 index e059b83297135c64fe27031ec74b0c6cf6bf5652..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.01488827258820393 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.015050266127564434 - }, - "anli_r3": { - "acc": 0.35, - "acc_stderr": 0.013774667009018554 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.28381437464629317 - }, - "copa": { - "acc": 0.64, - "acc_stderr": 0.04824181513244218 - }, - "hellaswag": { - "acc": 0.37492531368253335, - "acc_stderr": 0.004831142570475513, - "acc_norm": 0.44851623182632944, - "acc_norm_stderr": 0.004963259311700551 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366426 - }, - "winogrande": { - "acc": 0.5122336227308603, - "acc_stderr": 0.01404827882040562 - }, - "storycloze_2016": { - "acc": 0.6419027258150721, - "acc_stderr": 0.01108700680992571 - }, - "boolq": { - "acc": 0.5146788990825688, - "acc_stderr": 0.00874128556866792 - }, - "arc_easy": { - "acc": 0.4739057239057239, - "acc_stderr": 0.010245801990240047, - "acc_norm": 0.46085858585858586, - "acc_norm_stderr": 0.01022829820076612 - }, - "arc_challenge": { - "acc": 0.2167235494880546, - "acc_stderr": 0.012040156713481192, - "acc_norm": 0.2593856655290102, - "acc_norm_stderr": 0.012808273573927097 - }, - "sciq": { - "acc": 0.688, - "acc_stderr": 0.01465847437050901, - "acc_norm": 0.679, - "acc_norm_stderr": 0.014770821817934637 - }, - "piqa": { - "acc": 0.6844396082698585, - "acc_stderr": 0.01084311920175894, - "acc_norm": 0.6871599564744287, - "acc_norm_stderr": 0.010817714425701088 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_3.csv b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..2df4b67dae30c6f5ed5cf25400e094b128e5832f --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.01484221315341124,0 +anli_r2,acc,0.345,0.015039986742055237,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.2175767918088737,0.012057262020972497,0 +arc_challenge,acc_norm,0.257679180887372,0.01278077056276841,0 +arc_easy,acc,0.4604377104377104,0.01022761638628901,0 +arc_easy,acc_norm,0.4503367003367003,0.010209047724374165,0 +boolq,acc,0.5318042813455658,0.008727345583419182,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.2884770346494762,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.3730332603067118,0.004826224784850451,0 +hellaswag,acc_norm,0.45180242979486157,0.004966544724452224,0 +piqa,acc,0.6882480957562568,0.010807431424873674,0 +piqa,acc_norm,0.691512513601741,0.010776164678037155,0 +rte,acc,0.51985559566787,0.030072723167317177,0 +sciq,acc,0.698,0.014526080235459543,0 +sciq,acc_norm,0.691,0.014619600977206488,0 +storycloze_2016,acc,0.6360235168359166,0.011126343044992834,0 +winogrande,acc,0.5177584846093133,0.014043619596174964,0 diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json deleted file mode 100644 index b224dc135fefd005d0dc09a53e809deef4eb7988..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.01484221315341124 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.2884770346494762 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - }, - "hellaswag": { - "acc": 0.3730332603067118, - "acc_stderr": 0.004826224784850451, - "acc_norm": 0.45180242979486157, - "acc_norm_stderr": 0.004966544724452224 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317177 - }, - "winogrande": { - "acc": 0.5177584846093133, - "acc_stderr": 0.014043619596174964 - }, - "storycloze_2016": { - "acc": 0.6360235168359166, - "acc_stderr": 0.011126343044992834 - }, - "boolq": { - "acc": 0.5318042813455658, - "acc_stderr": 0.008727345583419182 - }, - "arc_easy": { - "acc": 0.4604377104377104, - "acc_stderr": 0.01022761638628901, - "acc_norm": 0.4503367003367003, - "acc_norm_stderr": 0.010209047724374165 - }, - "arc_challenge": { - "acc": 0.2175767918088737, - "acc_stderr": 0.012057262020972497, - "acc_norm": 0.257679180887372, - "acc_norm_stderr": 0.01278077056276841 - }, - "sciq": { - "acc": 0.698, - "acc_stderr": 0.014526080235459543, - "acc_norm": 0.691, - "acc_norm_stderr": 0.014619600977206488 - }, - "piqa": { - "acc": 0.6882480957562568, - "acc_stderr": 0.010807431424873674, - "acc_norm": 0.691512513601741, - "acc_norm_stderr": 0.010776164678037155 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_4.csv b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..e54ff9c7a164decf5814abc1a6275de312e87bb2 --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.316,0.014709193056057123,0 +anli_r2,acc,0.352,0.015110404505648668,0 +anli_r3,acc,0.3591666666666667,0.013855141559780354,0 +arc_challenge,acc,0.22866894197952217,0.012272853582540806,0 +arc_challenge,acc_norm,0.2440273037542662,0.01255144762785626,0 +arc_easy,acc,0.4692760942760943,0.010240395584815237,0 +arc_easy,acc_norm,0.4541245791245791,0.010216507710244096,0 +boolq,acc,0.5336391437308868,0.008725240861131139,1 +cb,acc,0.5892857142857143,0.06633634150359538,1 +cb,f1,0.37564837564837555,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.3721370244971121,0.0048238677613324675,0 +hellaswag,acc_norm,0.44652459669388567,0.004961161589228418,0 +piqa,acc,0.6817192600652884,0.010868093932082233,0 +piqa,acc_norm,0.6844396082698585,0.010843119201758945,0 +rte,acc,0.5379061371841155,0.030009848912529113,0 +sciq,acc,0.699,0.01451239503354315,0 +sciq,acc_norm,0.685,0.014696631960792498,0 +storycloze_2016,acc,0.638161411010155,0.011112247531047545,0 +winogrande,acc,0.5082872928176796,0.014050555322824185,0 diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_4_lm-eval_global_step52452_2023-02-11-12-41-26_4shots_backup.json b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_4_lm-eval_global_step52452_2023-02-11-12-41-26_4shots_backup.json deleted file mode 100644 index f683963cb90bdea733aaaf9a632b6e6b9312c1e1..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_4_lm-eval_global_step52452_2023-02-11-12-41-26_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.316, - "acc_stderr": 0.014709193056057123 - }, - "anli_r2": { - "acc": 0.352, - "acc_stderr": 0.015110404505648668 - }, - "anli_r3": { - "acc": 0.3591666666666667, - "acc_stderr": 0.013855141559780354 - }, - "cb": { - "acc": 0.5892857142857143, - "acc_stderr": 0.06633634150359538, - "f1": 0.37564837564837555 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - }, - "hellaswag": { - "acc": 0.3721370244971121, - "acc_stderr": 0.0048238677613324675, - "acc_norm": 0.44652459669388567, - "acc_norm_stderr": 0.004961161589228418 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.5082872928176796, - "acc_stderr": 0.014050555322824185 - }, - "storycloze_2016": { - "acc": 0.638161411010155, - "acc_stderr": 0.011112247531047545 - }, - "boolq": { - "acc": 0.5336391437308868, - "acc_stderr": 0.008725240861131139 - }, - "arc_easy": { - "acc": 0.4692760942760943, - "acc_stderr": 0.010240395584815237, - "acc_norm": 0.4541245791245791, - "acc_norm_stderr": 0.010216507710244096 - }, - "arc_challenge": { - "acc": 0.22866894197952217, - "acc_stderr": 0.012272853582540806, - "acc_norm": 0.2440273037542662, - "acc_norm_stderr": 0.01255144762785626 - }, - "sciq": { - "acc": 0.699, - "acc_stderr": 0.01451239503354315, - "acc_norm": 0.685, - "acc_norm_stderr": 0.014696631960792498 - }, - "piqa": { - "acc": 0.6817192600652884, - "acc_stderr": 0.010868093932082233, - "acc_norm": 0.6844396082698585, - "acc_norm_stderr": 0.010843119201758945 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_5.csv b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..b03fcc61d36a82ea4c5ddab440520bb687f40bc2 --- /dev/null +++ b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.358,0.015167928865407559,0 +anli_r2,acc,0.348,0.01507060460376841,0 +anli_r3,acc,0.35333333333333333,0.013804572162314933,0 +arc_challenge,acc,0.22866894197952217,0.012272853582540797,0 +arc_challenge,acc_norm,0.2551194539249147,0.012739038695202109,0 +arc_easy,acc,0.4642255892255892,0.01023348870972654,0 +arc_easy,acc_norm,0.44023569023569026,0.010186228624515653,0 +boolq,acc,0.5376146788990825,0.00872027373643368,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3272573280306378,,1 +copa,acc,0.64,0.04824181513244218,0 +hellaswag,acc,0.3732324238199562,0.0048267461608301875,0 +hellaswag,acc_norm,0.45289782911770565,0.004967591267557391,0 +piqa,acc,0.6838955386289445,0.010848148455700453,0 +piqa,acc_norm,0.6871599564744287,0.010817714425701086,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.7,0.014498627873361428,0 +sciq,acc_norm,0.703,0.014456832294801106,0 +storycloze_2016,acc,0.6365579903794762,0.01112284144205971,0 +winogrande,acc,0.510655090765588,0.014049294536290403,0 diff --git a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_5_lm-eval_global_step52452_2023-02-11-12-41-27_5shots_backup.json b/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_5_lm-eval_global_step52452_2023-02-11-12-41-27_5shots_backup.json deleted file mode 100644 index a2a6616c5407aea7e5eb48bee388e49a07bfec73..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed2/evaluation/rankeval/2b855b1b25c4seed2_5_lm-eval_global_step52452_2023-02-11-12-41-27_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.358, - "acc_stderr": 0.015167928865407559 - }, - "anli_r2": { - "acc": 0.348, - "acc_stderr": 0.01507060460376841 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314933 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.3272573280306378 - }, - "copa": { - "acc": 0.64, - "acc_stderr": 0.04824181513244218 - }, - "hellaswag": { - "acc": 0.3732324238199562, - "acc_stderr": 0.0048267461608301875, - "acc_norm": 0.45289782911770565, - "acc_norm_stderr": 0.004967591267557391 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.510655090765588, - "acc_stderr": 0.014049294536290403 - }, - "storycloze_2016": { - "acc": 0.6365579903794762, - "acc_stderr": 0.01112284144205971 - }, - "boolq": { - "acc": 0.5376146788990825, - "acc_stderr": 0.00872027373643368 - }, - "arc_easy": { - "acc": 0.4642255892255892, - "acc_stderr": 0.01023348870972654, - "acc_norm": 0.44023569023569026, - "acc_norm_stderr": 0.010186228624515653 - }, - "arc_challenge": { - "acc": 0.22866894197952217, - "acc_stderr": 0.012272853582540797, - "acc_norm": 0.2551194539249147, - "acc_norm_stderr": 0.012739038695202109 - }, - "sciq": { - "acc": 0.7, - "acc_stderr": 0.014498627873361428, - "acc_norm": 0.703, - "acc_norm_stderr": 0.014456832294801106 - }, - "piqa": { - "acc": 0.6838955386289445, - "acc_stderr": 0.010848148455700453, - "acc_norm": 0.6871599564744287, - "acc_norm_stderr": 0.010817714425701086 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/generation/merged.csv b/2b855b1b25c4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..0464cefdb058565a078faccc1b8996999fb3f3da --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.04005478057090433 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.04005478057090433 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.10448941521238508 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.10448941521238508 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1270241766491202 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1270241766491202 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.13428696867360543 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.13428696867360543 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.13608583415947614 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.13608583415947614 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1340355684574405 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1340355684574405 +e2e_nlg_cleaned,5,average,multiple,0.11266279062048862 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03474841233857288 +gem_xsum,0,median,rouge2_fmeasure,0.03474841233857288 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.018575263603814434 +gem_xsum,1,median,rouge2_fmeasure,0.018575263603814434 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.01723734576335539 +gem_xsum,2,median,rouge2_fmeasure,0.01723734576335539 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.016445615664662447 +gem_xsum,3,median,rouge2_fmeasure,0.016445615664662447 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0032383129780917384 +gem_xsum,4,median,rouge2_fmeasure,0.0032383129780917384 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002575849033001904 +gem_xsum,5,median,rouge2_fmeasure,0.0002575849033001904 +gem_xsum,5,average,multiple,0.015083755875299512 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.035728816082453776 +web_nlg_en,0,median,rouge2_fmeasure,0.035728816082453776 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.025771315568028322 +web_nlg_en,1,median,rouge2_fmeasure,0.025771315568028322 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.027564458157093037 +web_nlg_en,2,median,rouge2_fmeasure,0.027564458157093037 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.02687536929190093 +web_nlg_en,3,median,rouge2_fmeasure,0.02687536929190093 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.027815336119813478 +web_nlg_en,4,median,rouge2_fmeasure,0.027815336119813478 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.026853052835336486 +web_nlg_en,5,median,rouge2_fmeasure,0.026853052835336486 +web_nlg_en,5,average,multiple,0.028434724675771003 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.025350280308615827 +wiki_lingua_en,0,median,rouge2_fmeasure,0.025350280308615827 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03120030466009696 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03120030466009696 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03202152301642874 +wiki_lingua_en,2,median,rouge2_fmeasure,0.03202152301642874 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.027177168452482477 +wiki_lingua_en,3,median,rouge2_fmeasure,0.027177168452482477 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.009128003381940237 +wiki_lingua_en,4,median,rouge2_fmeasure,0.009128003381940237 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.00137107437347401 +wiki_lingua_en,5,median,rouge2_fmeasure,0.00137107437347401 +wiki_lingua_en,5,average,multiple,0.021041392365506374 diff --git a/2b855b1b25c4seed3/evaluation/generation/merged.json b/2b855b1b25c4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ad0d37104ff571b959f446022ba402de8a9bdf --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32383357497113313, "bleu_stderr": 0.029046108804920062, "rouge1_fmeasure": 0.08867537209167725, "rouge1_fmeasure_stderr": 0.0020309041501027445, "rouge1_precision": 0.08547205143678895, "rouge1_precision_stderr": 0.0035462344010718056, "rouge1_recall": 0.2412304632079097, "rouge1_recall_stderr": 0.00462775372151338, "rouge2_fmeasure": 0.035728816082453776, "rouge2_fmeasure_stderr": 0.001288792365661457, "rouge2_precision": 0.03835539021999813, "rouge2_precision_stderr": 0.0026018461185974367, "rouge2_recall": 0.09736884649162177, "rouge2_recall_stderr": 0.0028279495697625776, "rougeL_fmeasure": 0.07933345352885332, "rougeL_fmeasure_stderr": 0.0017939860783004774, "rougeL_precision": 0.07734285672702573, "rougeL_precision_stderr": 0.0033268477097065353, "rougeL_recall": 0.21914794682841368, "rougeL_recall_stderr": 0.00427555518759079, "rougeLsum_fmeasure": 0.08094015478602756, "rougeLsum_fmeasure_stderr": 0.0018612550472719624, "rougeLsum_precision": 0.07921588269939506, "rougeLsum_precision_stderr": 0.0033960397433610546, "rougeLsum_recall": 0.21995254223247163, "rougeLsum_recall_stderr": 0.004212090340244044}}, "1": {"PALM_prompt": {"bleu": 0.27606720676379776, "bleu_stderr": 0.02567270897812953, "rouge1_fmeasure": 0.07652291048904163, "rouge1_fmeasure_stderr": 0.0015788509535207115, "rouge1_precision": 0.054233241344685094, "rouge1_precision_stderr": 0.0017175921662412952, "rouge1_recall": 0.25860610831175956, "rouge1_recall_stderr": 0.004716061858394238, "rouge2_fmeasure": 0.025771315568028322, "rouge2_fmeasure_stderr": 0.0008791155455381993, "rouge2_precision": 0.018742049018639517, "rouge2_precision_stderr": 0.0009887139343062825, "rouge2_recall": 0.08841051293879301, "rouge2_recall_stderr": 0.002610289561418993, "rougeL_fmeasure": 0.06562305953873357, "rougeL_fmeasure_stderr": 0.0013288046309722294, "rougeL_precision": 0.046848386792670414, "rougeL_precision_stderr": 0.0015498169350517588, "rougeL_recall": 0.2247124273465997, "rougeL_recall_stderr": 0.004090572189556204, "rougeLsum_fmeasure": 0.06991779644956571, "rougeLsum_fmeasure_stderr": 0.001442647279422012, "rougeLsum_precision": 0.04984444124006735, "rougeLsum_precision_stderr": 0.001629428324043986, "rougeLsum_recall": 0.23619119945138067, "rougeLsum_recall_stderr": 0.004269080349085859}}, "2": {"PALM_prompt": {"bleu": 0.27941962450024044, "bleu_stderr": 0.01593252904314325, "rouge1_fmeasure": 0.08144748456601718, "rouge1_fmeasure_stderr": 0.0015786308563713364, "rouge1_precision": 0.05737670094629646, "rouge1_precision_stderr": 0.0017484919226613787, "rouge1_recall": 0.27620421628220476, "rouge1_recall_stderr": 0.004695557871110082, "rouge2_fmeasure": 0.027564458157093037, "rouge2_fmeasure_stderr": 0.0009021190593866042, "rouge2_precision": 0.02008511812792404, "rouge2_precision_stderr": 0.0010301289603768043, "rouge2_recall": 0.098704868372938, "rouge2_recall_stderr": 0.002803182981399675, "rougeL_fmeasure": 0.06876836746756484, "rougeL_fmeasure_stderr": 0.0012839891635731196, "rougeL_precision": 0.048592262721012046, "rougeL_precision_stderr": 0.0014879079682169785, "rougeL_recall": 0.23736248249365288, "rougeL_recall_stderr": 0.004042561080475512, "rougeLsum_fmeasure": 0.07421362712033641, "rougeLsum_fmeasure_stderr": 0.0014082919722638784, "rougeLsum_precision": 0.0523658146406785, "rougeLsum_precision_stderr": 0.0015765607275456583, "rougeLsum_recall": 0.2533572380146754, "rougeLsum_recall_stderr": 0.0042952606729823916}}, "3": {"PALM_prompt": {"bleu": 0.3171970408758897, "bleu_stderr": 0.017426151872638067, "rouge1_fmeasure": 0.07964841752767432, "rouge1_fmeasure_stderr": 0.001446983625193749, "rouge1_precision": 0.053181290164006, "rouge1_precision_stderr": 0.0013033819726948207, "rouge1_recall": 0.27653424880189637, "rouge1_recall_stderr": 0.004735914425800043, "rouge2_fmeasure": 0.02687536929190093, "rouge2_fmeasure_stderr": 0.0007993720138683921, "rouge2_precision": 0.01791860534191511, "rouge2_precision_stderr": 0.000676952693842271, "rouge2_recall": 0.09973868252702602, "rouge2_recall_stderr": 0.0028583962586225054, "rougeL_fmeasure": 0.06737223953732899, "rougeL_fmeasure_stderr": 0.0011754200304422387, "rougeL_precision": 0.04493654346005872, "rougeL_precision_stderr": 0.0010787781593833952, "rougeL_recall": 0.23792045342184467, "rougeL_recall_stderr": 0.004056403170683843, "rougeLsum_fmeasure": 0.07298692518348078, "rougeLsum_fmeasure_stderr": 0.0013169510058133435, "rougeLsum_precision": 0.048833729202093897, "rougeLsum_precision_stderr": 0.0011907253374175474, "rougeLsum_recall": 0.2531979464952816, "rougeLsum_recall_stderr": 0.004297406967512391}}, "4": {"PALM_prompt": {"bleu": 0.34023697487919763, "bleu_stderr": 0.018775432398829513, "rouge1_fmeasure": 0.08193853265075213, "rouge1_fmeasure_stderr": 0.0015174259582001043, "rouge1_precision": 0.05647836093853172, "rouge1_precision_stderr": 0.0017737356774995025, "rouge1_recall": 0.2852997799671812, "rouge1_recall_stderr": 0.004870712237798475, "rouge2_fmeasure": 0.027815336119813478, "rouge2_fmeasure_stderr": 0.0008738569376365014, "rouge2_precision": 0.019966015896233587, "rouge2_precision_stderr": 0.0011369698857611513, "rouge2_recall": 0.10382156039649827, "rouge2_recall_stderr": 0.0029649251113333572, "rougeL_fmeasure": 0.06837794366014814, "rougeL_fmeasure_stderr": 0.0012217869742530332, "rougeL_precision": 0.04745243462784577, "rougeL_precision_stderr": 0.001570420893755352, "rougeL_recall": 0.24286070752329061, "rougeL_recall_stderr": 0.004163504916615855, "rougeLsum_fmeasure": 0.07427281676174251, "rougeLsum_fmeasure_stderr": 0.0013412792899091318, "rougeLsum_precision": 0.05134007627756032, "rougeLsum_precision_stderr": 0.0016315343726760047, "rougeLsum_recall": 0.25996425773430787, "rougeLsum_recall_stderr": 0.0043995446381541}}, "5": {"PALM_prompt": {"bleu": 0.3145033709972809, "bleu_stderr": 0.02263346227282073, "rouge1_fmeasure": 0.08112927496184326, "rouge1_fmeasure_stderr": 0.001381745333718226, "rouge1_precision": 0.05427950798236395, "rouge1_precision_stderr": 0.0014485272262578887, "rouge1_recall": 0.2887819962687885, "rouge1_recall_stderr": 0.004836201048504248, "rouge2_fmeasure": 0.026853052835336486, "rouge2_fmeasure_stderr": 0.0007555338988946547, "rouge2_precision": 0.01781208802708474, "rouge2_precision_stderr": 0.0007300877506386661, "rouge2_recall": 0.10483227148005658, "rouge2_recall_stderr": 0.0029517707314570046, "rougeL_fmeasure": 0.06772590467337501, "rougeL_fmeasure_stderr": 0.001097382415334277, "rougeL_precision": 0.045522730494063905, "rougeL_precision_stderr": 0.0012631643140105868, "rougeL_recall": 0.24572178582889284, "rougeL_recall_stderr": 0.004101415429000939, "rougeLsum_fmeasure": 0.0740923092063082, "rougeLsum_fmeasure_stderr": 0.0012429930675603453, "rougeLsum_precision": 0.049701509372279865, "rougeLsum_precision_stderr": 0.0013473025262905167, "rougeLsum_recall": 0.2652804820622864, "rougeLsum_recall_stderr": 0.004456640957834833}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.0698281567248469, "bleu_stderr": 0.031197948676954808, "rouge1_fmeasure": 0.15293740987304577, "rouge1_fmeasure_stderr": 0.0018125502197827886, "rouge1_precision": 0.13657400102348152, "rouge1_precision_stderr": 0.0019602209071817017, "rouge1_recall": 0.21594236870418015, "rouge1_recall_stderr": 0.00252082384017723, "rouge2_fmeasure": 0.025350280308615827, "rouge2_fmeasure_stderr": 0.0007063744981585526, "rouge2_precision": 0.0229238338882169, "rouge2_precision_stderr": 0.0007667053864034924, "rouge2_recall": 0.036478343139842336, "rouge2_recall_stderr": 0.0010927774223228066, "rougeL_fmeasure": 0.1186521222139016, "rougeL_fmeasure_stderr": 0.0013050858019646526, "rougeL_precision": 0.10523920309420877, "rougeL_precision_stderr": 0.0014350881890456634, "rougeL_recall": 0.17099648018625646, "rougeL_recall_stderr": 0.001988402849794833, "rougeLsum_fmeasure": 0.14334885314947377, "rougeLsum_fmeasure_stderr": 0.001691074643329425, "rougeLsum_precision": 0.12801391165143916, "rougeLsum_precision_stderr": 0.0018449412953001052, "rougeLsum_recall": 0.20301493151029282, "rougeLsum_recall_stderr": 0.002375290861956505}}, "1": {"tldr_en": {"bleu": 1.3897199003637006, "bleu_stderr": 0.07063823378706162, "rouge1_fmeasure": 0.17640037056685856, "rouge1_fmeasure_stderr": 0.0017709475600536874, "rouge1_precision": 0.1527190793869336, "rouge1_precision_stderr": 0.0019033085607925665, "rouge1_recall": 0.2524664063557302, "rouge1_recall_stderr": 0.002417513324483763, "rouge2_fmeasure": 0.03120030466009696, "rouge2_fmeasure_stderr": 0.0007645415681082206, "rouge2_precision": 0.02718105183486924, "rouge2_precision_stderr": 0.0007144209800115836, "rouge2_recall": 0.04458435475470699, "rouge2_recall_stderr": 0.0011839595142118518, "rougeL_fmeasure": 0.12843607676528526, "rougeL_fmeasure_stderr": 0.0011912754913960411, "rougeL_precision": 0.10989541933370581, "rougeL_precision_stderr": 0.0012759456866320233, "rougeL_recall": 0.18878748532124173, "rougeL_recall_stderr": 0.0018598588741222527, "rougeLsum_fmeasure": 0.1663913504046975, "rougeLsum_fmeasure_stderr": 0.0016547117851710758, "rougeLsum_precision": 0.14390745954092274, "rougeLsum_precision_stderr": 0.0017831895165428449, "rougeLsum_recall": 0.23872710926657703, "rougeLsum_recall_stderr": 0.002283757822630687}}, "2": {"tldr_en": {"bleu": 1.4008067459167313, "bleu_stderr": 0.08827857828822949, "rouge1_fmeasure": 0.179980986040312, "rouge1_fmeasure_stderr": 0.0017390033488425191, "rouge1_precision": 0.1550390497788087, "rouge1_precision_stderr": 0.0018666807171491579, "rouge1_recall": 0.2580077899224292, "rouge1_recall_stderr": 0.00235000733006759, "rouge2_fmeasure": 0.03202152301642874, "rouge2_fmeasure_stderr": 0.0007531593839558726, "rouge2_precision": 0.027739859281398428, "rouge2_precision_stderr": 0.0006960250066685061, "rouge2_recall": 0.046577295588834305, "rouge2_recall_stderr": 0.0012136373934520796, "rougeL_fmeasure": 0.13230387563247092, "rougeL_fmeasure_stderr": 0.0011613204218787101, "rougeL_precision": 0.11243606095706739, "rougeL_precision_stderr": 0.0012224684496215659, "rougeL_recall": 0.1949503997997534, "rougeL_recall_stderr": 0.0018369703062687025, "rougeLsum_fmeasure": 0.1693617273512457, "rougeLsum_fmeasure_stderr": 0.0016258958744061421, "rougeLsum_precision": 0.14565340968537027, "rougeLsum_precision_stderr": 0.0017371493090466129, "rougeLsum_recall": 0.2433756462741806, "rougeLsum_recall_stderr": 0.002216006245636251}}, "3": {"tldr_en": {"bleu": 1.4152325868889817, "bleu_stderr": 0.06126315971630493, "rouge1_fmeasure": 0.1502260326415678, "rouge1_fmeasure_stderr": 0.0019654672483749272, "rouge1_precision": 0.13366822151647628, "rouge1_precision_stderr": 0.0020933703252128655, "rouge1_recall": 0.21650682034095572, "rouge1_recall_stderr": 0.00282120863440036, "rouge2_fmeasure": 0.027177168452482477, "rouge2_fmeasure_stderr": 0.0007286450388632726, "rouge2_precision": 0.02460982260757917, "rouge2_precision_stderr": 0.0008044561072427725, "rouge2_recall": 0.03890176754974684, "rouge2_recall_stderr": 0.00111249138447195, "rougeL_fmeasure": 0.11119511753077438, "rougeL_fmeasure_stderr": 0.001380619777826452, "rougeL_precision": 0.09845753489869857, "rougeL_precision_stderr": 0.0015125182983926264, "rougeL_recall": 0.16389755378208323, "rougeL_recall_stderr": 0.002174154355006171, "rougeLsum_fmeasure": 0.14154892995370785, "rougeLsum_fmeasure_stderr": 0.0018542083256364208, "rougeLsum_precision": 0.12596681985294686, "rougeLsum_precision_stderr": 0.0019804834099973927, "rougeLsum_recall": 0.20433261658898713, "rougeLsum_recall_stderr": 0.0026833088776520636}}, "4": {"tldr_en": {"bleu": 0.31344546144041746, "bleu_stderr": 0.033904653644469845, "rouge1_fmeasure": 0.05016152959188063, "rouge1_fmeasure_stderr": 0.001728101228322266, "rouge1_precision": 0.04578864816825674, "rouge1_precision_stderr": 0.0017185324088034625, "rouge1_recall": 0.07435108727330393, "rouge1_recall_stderr": 0.002582758592511787, "rouge2_fmeasure": 0.009128003381940237, "rouge2_fmeasure_stderr": 0.0004958439228770255, "rouge2_precision": 0.008063357150658304, "rouge2_precision_stderr": 0.0004757626380558336, "rouge2_recall": 0.013747869153150247, "rouge2_recall_stderr": 0.0007975828857424967, "rougeL_fmeasure": 0.037720056205184496, "rougeL_fmeasure_stderr": 0.0012739421837830972, "rougeL_precision": 0.034563632931568285, "rougeL_precision_stderr": 0.0013092649784433536, "rougeL_recall": 0.05696197164006717, "rougeL_recall_stderr": 0.001985930236653378, "rougeLsum_fmeasure": 0.04672288223785032, "rougeLsum_fmeasure_stderr": 0.0016075615535180764, "rougeLsum_precision": 0.0426792946937854, "rougeLsum_precision_stderr": 0.0016067691953483522, "rougeLsum_recall": 0.0694521763610228, "rougeLsum_recall_stderr": 0.0024176114065839085}}, "5": {"tldr_en": {"bleu": 4.7009798797569006e-07, "bleu_stderr": 9.938433754764014e-07, "rouge1_fmeasure": 0.008063875824543603, "rouge1_fmeasure_stderr": 0.0007777127977281093, "rouge1_precision": 0.0075656803850255114, "rouge1_precision_stderr": 0.0007657664823996809, "rouge1_recall": 0.012174642882317928, "rouge1_recall_stderr": 0.0011992941173112328, "rouge2_fmeasure": 0.00137107437347401, "rouge2_fmeasure_stderr": 0.00019512119300503364, "rouge2_precision": 0.0011570262490963271, "rouge2_precision_stderr": 0.0001707300688542471, "rouge2_recall": 0.0022164885592608316, "rouge2_recall_stderr": 0.000322047566895165, "rougeL_fmeasure": 0.006043541502258262, "rougeL_fmeasure_stderr": 0.0005687395914572521, "rougeL_precision": 0.005630382841922863, "rougeL_precision_stderr": 0.0005619428638458995, "rougeL_recall": 0.009493480040723675, "rougeL_recall_stderr": 0.0009395332219244073, "rougeLsum_fmeasure": 0.007525952069746523, "rougeLsum_fmeasure_stderr": 0.0007227857896443747, "rougeLsum_precision": 0.007097183357200199, "rougeLsum_precision_stderr": 0.0007187758543412052, "rougeLsum_recall": 0.011352084237459566, "rougeLsum_recall_stderr": 0.0011076421847382167}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.0584490875027606, "bleu_stderr": 0.05784565485090222, "rouge1_fmeasure": 0.18221996450216535, "rouge1_fmeasure_stderr": 0.0014716360435734909, "rouge1_precision": 0.1736038095061394, "rouge1_precision_stderr": 0.002114399609053922, "rouge1_recall": 0.24742422083008644, "rouge1_recall_stderr": 0.0021313847417907985, "rouge2_fmeasure": 0.04005478057090433, "rouge2_fmeasure_stderr": 0.0008355305888496138, "rouge2_precision": 0.033255523912924585, "rouge2_precision_stderr": 0.000752812254040243, "rouge2_recall": 0.05858785272346365, "rouge2_recall_stderr": 0.0012581812005181203, "rougeL_fmeasure": 0.16176145114483464, "rougeL_fmeasure_stderr": 0.0012496622966848437, "rougeL_precision": 0.15233577608768759, "rougeL_precision_stderr": 0.0017935957513507915, "rougeL_recall": 0.2227801166901782, "rougeL_recall_stderr": 0.0019464763612749606, "rougeLsum_fmeasure": 0.15373217700275651, "rougeLsum_fmeasure_stderr": 0.0013518981832995032, "rougeLsum_precision": 0.14768583348672515, "rougeLsum_precision_stderr": 0.0019357690046436949, "rougeLsum_recall": 0.20806382450379488, "rougeLsum_recall_stderr": 0.0019393113411551937}}, "1": {"generate_text_restaurant": {"bleu": 4.624680732729629, "bleu_stderr": 0.05243089436123862, "rouge1_fmeasure": 0.3004266131905225, "rouge1_fmeasure_stderr": 0.0018638820507798249, "rouge1_precision": 0.2543869109755681, "rouge1_precision_stderr": 0.0020096854602199917, "rouge1_recall": 0.4163609451161923, "rouge1_recall_stderr": 0.002652882005669645, "rouge2_fmeasure": 0.10448941521238508, "rouge2_fmeasure_stderr": 0.001293054700377555, "rouge2_precision": 0.08718193915536301, "rouge2_precision_stderr": 0.0011795545548325646, "rouge2_recall": 0.14873595778197404, "rouge2_recall_stderr": 0.0018987356487853765, "rougeL_fmeasure": 0.21990918713413035, "rougeL_fmeasure_stderr": 0.001334910216565677, "rougeL_precision": 0.18510357316615697, "rougeL_precision_stderr": 0.0014177587523362468, "rougeL_recall": 0.3086699078602609, "rougeL_recall_stderr": 0.0021116248923408795, "rougeLsum_fmeasure": 0.2516648526497101, "rougeLsum_fmeasure_stderr": 0.0017247118377250164, "rougeLsum_precision": 0.21335307311143678, "rougeLsum_precision_stderr": 0.0018154586057146727, "rougeLsum_recall": 0.3489219749502478, "rougeLsum_recall_stderr": 0.0024681408089484457}}, "2": {"generate_text_restaurant": {"bleu": 5.805176481782663, "bleu_stderr": 0.10754187782564871, "rouge1_fmeasure": 0.327314489908299, "rouge1_fmeasure_stderr": 0.0019262708389304786, "rouge1_precision": 0.2810240735763789, "rouge1_precision_stderr": 0.0021373374805073787, "rouge1_recall": 0.4393128734515783, "rouge1_recall_stderr": 0.0025783751482048043, "rouge2_fmeasure": 0.1270241766491202, "rouge2_fmeasure_stderr": 0.001370487099509288, "rouge2_precision": 0.10825122577324459, "rouge2_precision_stderr": 0.0012920031692932362, "rouge2_recall": 0.17372706946673774, "rouge2_recall_stderr": 0.0019211181687330625, "rougeL_fmeasure": 0.2377696202178137, "rougeL_fmeasure_stderr": 0.0014182753705991385, "rougeL_precision": 0.20300281283065, "rougeL_precision_stderr": 0.001530366533173726, "rougeL_recall": 0.3228799039338562, "rougeL_recall_stderr": 0.002113717188309046, "rougeLsum_fmeasure": 0.27718866736401243, "rougeLsum_fmeasure_stderr": 0.0017932520137662555, "rougeLsum_precision": 0.23778519446318125, "rougeLsum_precision_stderr": 0.0019109405793450672, "rougeLsum_recall": 0.37258311793125337, "rougeLsum_recall_stderr": 0.002447717629939462}}, "3": {"generate_text_restaurant": {"bleu": 6.0702389361305595, "bleu_stderr": 0.08194830850984278, "rouge1_fmeasure": 0.3356100394849916, "rouge1_fmeasure_stderr": 0.0018959902894004825, "rouge1_precision": 0.28663248616853076, "rouge1_precision_stderr": 0.002146723805893031, "rouge1_recall": 0.4506189673504115, "rouge1_recall_stderr": 0.002436200606032119, "rouge2_fmeasure": 0.13428696867360543, "rouge2_fmeasure_stderr": 0.0013696727400582546, "rouge2_precision": 0.11410369352717002, "rouge2_precision_stderr": 0.001304314837966636, "rouge2_recall": 0.18298958461230197, "rouge2_recall_stderr": 0.001882753474917576, "rougeL_fmeasure": 0.24232949096712741, "rougeL_fmeasure_stderr": 0.0014014918847133, "rougeL_precision": 0.20559218248043012, "rougeL_precision_stderr": 0.0015150708446965548, "rougeL_recall": 0.3295088339665089, "rougeL_recall_stderr": 0.002056598014809842, "rougeLsum_fmeasure": 0.2841487020857327, "rougeLsum_fmeasure_stderr": 0.0017583889283832343, "rougeLsum_precision": 0.24238497536831147, "rougeLsum_precision_stderr": 0.0019089058807953963, "rougeLsum_recall": 0.3825657874017383, "rougeLsum_recall_stderr": 0.0023498365548672035}}, "4": {"generate_text_restaurant": {"bleu": 6.151131000451251, "bleu_stderr": 0.08675882994639339, "rouge1_fmeasure": 0.3359817537488032, "rouge1_fmeasure_stderr": 0.0018489187424229685, "rouge1_precision": 0.28560539279452085, "rouge1_precision_stderr": 0.0021099291165378284, "rouge1_recall": 0.45430689331004465, "rouge1_recall_stderr": 0.0024026673233587116, "rouge2_fmeasure": 0.13608583415947614, "rouge2_fmeasure_stderr": 0.0013564323328534673, "rouge2_precision": 0.11510321511249016, "rouge2_precision_stderr": 0.001301343435555447, "rouge2_recall": 0.1874680517865149, "rouge2_recall_stderr": 0.0019021887451390039, "rougeL_fmeasure": 0.24041522034212437, "rougeL_fmeasure_stderr": 0.0013766856123865099, "rougeL_precision": 0.2031567857133821, "rougeL_precision_stderr": 0.0015001239767546484, "rougeL_recall": 0.3289852955373659, "rougeL_recall_stderr": 0.002037387218993908, "rougeLsum_fmeasure": 0.2839218883334275, "rougeLsum_fmeasure_stderr": 0.0017317527531115658, "rougeLsum_precision": 0.24102976436421542, "rougeLsum_precision_stderr": 0.0018824356028074483, "rougeLsum_recall": 0.3848949410972484, "rougeLsum_recall_stderr": 0.002333108882205964}}, "5": {"generate_text_restaurant": {"bleu": 5.996601972874363, "bleu_stderr": 0.062498386190361466, "rouge1_fmeasure": 0.332744298195428, "rouge1_fmeasure_stderr": 0.0018083359222420695, "rouge1_precision": 0.27984037424342584, "rouge1_precision_stderr": 0.002035356724695903, "rouge1_recall": 0.4562214678265182, "rouge1_recall_stderr": 0.0023267113497673297, "rouge2_fmeasure": 0.1340355684574405, "rouge2_fmeasure_stderr": 0.0013382542668832539, "rouge2_precision": 0.11229104687279592, "rouge2_precision_stderr": 0.0012711202305475892, "rouge2_recall": 0.18651849928291545, "rouge2_recall_stderr": 0.0018559080307442974, "rougeL_fmeasure": 0.2380870181376768, "rougeL_fmeasure_stderr": 0.0013780430632687817, "rougeL_precision": 0.1990924401150559, "rougeL_precision_stderr": 0.0014686493900604777, "rougeL_recall": 0.3301426182345506, "rougeL_recall_stderr": 0.0020130032251455187, "rougeLsum_fmeasure": 0.2813556234789683, "rougeLsum_fmeasure_stderr": 0.0017104150207607595, "rougeLsum_precision": 0.23632585055357197, "rougeLsum_precision_stderr": 0.0018266202105727374, "rougeLsum_recall": 0.38651218746639665, "rougeLsum_recall_stderr": 0.002280924331466531}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.2351135588590574, "bleu_stderr": 0.10488694867377531, "rouge1_fmeasure": 0.18657848599860072, "rouge1_fmeasure_stderr": 0.002222975019108831, "rouge1_precision": 0.13545711524545212, "rouge1_precision_stderr": 0.0018643786215580213, "rouge1_recall": 0.32266286403872463, "rouge1_recall_stderr": 0.0037254982882621427, "rouge2_fmeasure": 0.03474841233857288, "rouge2_fmeasure_stderr": 0.001233981329533342, "rouge2_precision": 0.025159941054200875, "rouge2_precision_stderr": 0.0009516205224541807, "rouge2_recall": 0.061291402142246806, "rouge2_recall_stderr": 0.0022285049276297964, "rougeL_fmeasure": 0.13894822401398402, "rougeL_fmeasure_stderr": 0.0016146564397941019, "rougeL_precision": 0.1008272260974001, "rougeL_precision_stderr": 0.001407515651441248, "rougeL_recall": 0.24190386360768146, "rougeL_recall_stderr": 0.002838177397645114, "rougeLsum_fmeasure": 0.1478827026547576, "rougeLsum_fmeasure_stderr": 0.0018092099707973965, "rougeLsum_precision": 0.10716035090872196, "rougeLsum_precision_stderr": 0.001519960553820242, "rougeLsum_recall": 0.2575468596418776, "rougeLsum_recall_stderr": 0.0031850609628431103}}, "1": {"article_DOC_summary": {"bleu": 0.5385269115414006, "bleu_stderr": 0.03829764057299067, "rouge1_fmeasure": 0.15034170474928443, "rouge1_fmeasure_stderr": 0.0019370955875090581, "rouge1_precision": 0.1067774448180075, "rouge1_precision_stderr": 0.001466077477126152, "rouge1_recall": 0.2651524302883161, "rouge1_recall_stderr": 0.0032156584359543067, "rouge2_fmeasure": 0.018575263603814434, "rouge2_fmeasure_stderr": 0.0008240738103629275, "rouge2_precision": 0.013075222110902597, "rouge2_precision_stderr": 0.0005813330602233087, "rouge2_recall": 0.03359114921273582, "rouge2_recall_stderr": 0.0015545772848912316, "rougeL_fmeasure": 0.11492744620012164, "rougeL_fmeasure_stderr": 0.001390753139093343, "rougeL_precision": 0.08148005095710249, "rougeL_precision_stderr": 0.0010485301608512115, "rougeL_recall": 0.20397422511597993, "rougeL_recall_stderr": 0.0024200460142002364, "rougeLsum_fmeasure": 0.12151876380190761, "rougeLsum_fmeasure_stderr": 0.001536100169936514, "rougeLsum_precision": 0.08610780549671544, "rougeLsum_precision_stderr": 0.0011513730347375746, "rougeLsum_recall": 0.21571629569327896, "rougeLsum_recall_stderr": 0.002668106390230952}}, "2": {"article_DOC_summary": {"bleu": 0.5382517158144792, "bleu_stderr": 0.04713019088308223, "rouge1_fmeasure": 0.14732220631324955, "rouge1_fmeasure_stderr": 0.0018884299672276643, "rouge1_precision": 0.10440701535898894, "rouge1_precision_stderr": 0.0014243914537581347, "rouge1_recall": 0.26017178467312074, "rouge1_recall_stderr": 0.0030916457399710485, "rouge2_fmeasure": 0.01723734576335539, "rouge2_fmeasure_stderr": 0.0008037190863709996, "rouge2_precision": 0.012199808372382208, "rouge2_precision_stderr": 0.0005707544309962022, "rouge2_recall": 0.030378117296604706, "rouge2_recall_stderr": 0.0014271440223042635, "rougeL_fmeasure": 0.11311348504118052, "rougeL_fmeasure_stderr": 0.0013671011090786072, "rougeL_precision": 0.07998976910690078, "rougeL_precision_stderr": 0.0010225901693129691, "rougeL_recall": 0.20105739218461574, "rougeL_recall_stderr": 0.0023407801195794316, "rougeLsum_fmeasure": 0.11850893467131327, "rougeLsum_fmeasure_stderr": 0.0015069429287548285, "rougeLsum_precision": 0.08376718696724832, "rougeLsum_precision_stderr": 0.0011190405511453988, "rougeLsum_recall": 0.21070773221761344, "rougeLsum_recall_stderr": 0.002585237268995173}}, "3": {"article_DOC_summary": {"bleu": 0.5039224360169209, "bleu_stderr": 0.035167806622985444, "rouge1_fmeasure": 0.13908613833766742, "rouge1_fmeasure_stderr": 0.0020684272348752896, "rouge1_precision": 0.10068688234642065, "rouge1_precision_stderr": 0.0016556534248994925, "rouge1_recall": 0.24301625963377552, "rouge1_recall_stderr": 0.0035599829357810284, "rouge2_fmeasure": 0.016445615664662447, "rouge2_fmeasure_stderr": 0.0007997290135518104, "rouge2_precision": 0.011907786317892789, "rouge2_precision_stderr": 0.0006007470885500614, "rouge2_recall": 0.029749140100470382, "rouge2_recall_stderr": 0.0015219049695756218, "rougeL_fmeasure": 0.10643938966941947, "rougeL_fmeasure_stderr": 0.0014828913289466287, "rougeL_precision": 0.07709856630175962, "rougeL_precision_stderr": 0.0012254350078061032, "rougeL_recall": 0.18703824979081757, "rougeL_recall_stderr": 0.0026685040024184187, "rougeLsum_fmeasure": 0.11365372460167819, "rougeLsum_fmeasure_stderr": 0.0016760324913324752, "rougeLsum_precision": 0.08221528231673934, "rougeLsum_precision_stderr": 0.0013498723180839932, "rougeLsum_recall": 0.1997525960776492, "rougeLsum_recall_stderr": 0.0029883312380197566}}, "4": {"article_DOC_summary": {"bleu": 0.20010351872206222, "bleu_stderr": 0.04387809393257718, "rouge1_fmeasure": 0.036831079186109235, "rouge1_fmeasure_stderr": 0.0020173636103753925, "rouge1_precision": 0.030739367389889232, "rouge1_precision_stderr": 0.0018163536266503726, "rouge1_recall": 0.05866870468713456, "rouge1_recall_stderr": 0.0032747003607880368, "rouge2_fmeasure": 0.0032383129780917384, "rouge2_fmeasure_stderr": 0.0003927160520613121, "rouge2_precision": 0.0024201488663514233, "rouge2_precision_stderr": 0.0003185574386000049, "rouge2_recall": 0.005529827499208075, "rouge2_recall_stderr": 0.0006405511209186582, "rougeL_fmeasure": 0.028660663031091726, "rougeL_fmeasure_stderr": 0.0015660227258984319, "rougeL_precision": 0.024336991646645382, "rougeL_precision_stderr": 0.001496163941574667, "rougeL_recall": 0.045630180771856745, "rougeL_recall_stderr": 0.0025390312076075333, "rougeLsum_fmeasure": 0.030628925036096075, "rougeLsum_fmeasure_stderr": 0.0016789537017533084, "rougeLsum_precision": 0.025836159019823328, "rougeLsum_precision_stderr": 0.0015674104571482793, "rougeLsum_recall": 0.04898129982644303, "rougeLsum_recall_stderr": 0.0027617930906188278}}, "5": {"article_DOC_summary": {"bleu": 1.0315801742136008e-38, "bleu_stderr": 5.992560226402246e-33, "rouge1_fmeasure": 0.0021358154630314085, "rouge1_fmeasure_stderr": 0.0005912295186770688, "rouge1_precision": 0.0023843465275499858, "rouge1_precision_stderr": 0.0006674291044716917, "rouge1_recall": 0.0019892304519971244, "rouge1_recall_stderr": 0.000547530792870024, "rouge2_fmeasure": 0.0002575849033001904, "rouge2_fmeasure_stderr": 0.00013307083152500616, "rouge2_precision": 0.0003000809122662035, "rouge2_precision_stderr": 0.0001558745365999162, "rouge2_recall": 0.00022870211549456832, "rouge2_recall_stderr": 0.00011861331033677322, "rougeL_fmeasure": 0.0018299461009661584, "rougeL_fmeasure_stderr": 0.0005179732110124337, "rougeL_precision": 0.002046828730265263, "rougeL_precision_stderr": 0.0005851338450962457, "rougeL_recall": 0.0017042261942431527, "rougeL_recall_stderr": 0.0004803344440760027, "rougeLsum_fmeasure": 0.0018789536971435658, "rougeLsum_fmeasure_stderr": 0.0005293007243818104, "rougeLsum_precision": 0.002097277726330241, "rougeLsum_precision_stderr": 0.000595767228097712, "rougeLsum_recall": 0.0017518724683045208, "rougeLsum_recall_stderr": 0.0004918743850837456}}}} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_0.csv b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..9731d76800e4b86274bbc10ee7d45de133bb98ca --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.314,0.01468399195108797,0 +anli_r2,acc,0.345,0.015039986742055237,0 +anli_r3,acc,0.3325,0.013605417345710526,0 +arc_challenge,acc,0.22525597269624573,0.01220783999540731,0 +arc_challenge,acc_norm,0.26109215017064846,0.01283552390947385,0 +arc_easy,acc,0.47685185185185186,0.010248782484554474,0 +arc_easy,acc_norm,0.42634680134680136,0.010147858603835134,0 +boolq,acc,0.4712538226299694,0.008730590188717146,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.35137701804368476,,1 +copa,acc,0.71,0.04560480215720684,0 +hellaswag,acc,0.380601473809998,0.004845424524764033,0 +hellaswag,acc_norm,0.45429197371041624,0.00496888813029006,0 +piqa,acc,0.6936887921653971,0.01075497003236732,0 +piqa,acc_norm,0.7013057671381937,0.010678556398149226,0 +rte,acc,0.44765342960288806,0.02993107036293953,0 +sciq,acc,0.706,0.014414290540008213,0 +sciq,acc_norm,0.64,0.015186527932040122,0 +storycloze_2016,acc,0.6493853554249065,0.01103431729046329,0 +winogrande,acc,0.5138121546961326,0.014047122916440419,0 diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_0_lm-eval_global_step52452_2023-02-11-12-41-27_0shots_backup.json b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_0_lm-eval_global_step52452_2023-02-11-12-41-27_0shots_backup.json deleted file mode 100644 index ec7cd30dcfd74be4413b80a3c4e6b50570292573..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_0_lm-eval_global_step52452_2023-02-11-12-41-27_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.314, - "acc_stderr": 0.01468399195108797 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.013605417345710526 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.35137701804368476 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.04560480215720684 - }, - "hellaswag": { - "acc": 0.380601473809998, - "acc_stderr": 0.004845424524764033, - "acc_norm": 0.45429197371041624, - "acc_norm_stderr": 0.00496888813029006 - }, - "rte": { - "acc": 0.44765342960288806, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5138121546961326, - "acc_stderr": 0.014047122916440419 - }, - "storycloze_2016": { - "acc": 0.6493853554249065, - "acc_stderr": 0.01103431729046329 - }, - "boolq": { - "acc": 0.4712538226299694, - "acc_stderr": 0.008730590188717146 - }, - "arc_easy": { - "acc": 0.47685185185185186, - "acc_stderr": 0.010248782484554474, - "acc_norm": 0.42634680134680136, - "acc_norm_stderr": 0.010147858603835134 - }, - "arc_challenge": { - "acc": 0.22525597269624573, - "acc_stderr": 0.01220783999540731, - "acc_norm": 0.26109215017064846, - "acc_norm_stderr": 0.01283552390947385 - }, - "sciq": { - "acc": 0.706, - "acc_stderr": 0.014414290540008213, - "acc_norm": 0.64, - "acc_norm_stderr": 0.015186527932040122 - }, - "piqa": { - "acc": 0.6936887921653971, - "acc_stderr": 0.01075497003236732, - "acc_norm": 0.7013057671381937, - "acc_norm_stderr": 0.010678556398149226 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_1.csv b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..1196e51f685ca304f4e037587b137fd06278a850 --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.014876872027456734,0 +anli_r2,acc,0.328,0.014853842487270334,0 +anli_r3,acc,0.3258333333333333,0.013535422043417464,0 +arc_challenge,acc,0.22440273037542663,0.012191404938603836,0 +arc_challenge,acc_norm,0.2508532423208191,0.01266819862131543,0 +arc_easy,acc,0.46254208754208753,0.010230952104570803,0 +arc_easy,acc_norm,0.42003367003367004,0.010127718838529405,0 +boolq,acc,0.47522935779816516,0.008734316719387784,1 +cb,acc,0.5357142857142857,0.0672477765493766,1 +cb,f1,0.37360259646592137,,1 +copa,acc,0.65,0.0479372485441102,0 +hellaswag,acc,0.37880900219079866,0.004840990593494679,0 +hellaswag,acc_norm,0.45249950209121687,0.004967213515483206,0 +piqa,acc,0.6811751904243744,0.010873037534333418,0 +piqa,acc_norm,0.6936887921653971,0.010754970032367323,0 +rte,acc,0.44404332129963897,0.029907396333795987,0 +sciq,acc,0.69,0.014632638658632896,0 +sciq,acc_norm,0.653,0.015060472031706617,0 +storycloze_2016,acc,0.640833778727953,0.011094293150908333,0 +winogrande,acc,0.5288082083662194,0.014029141615909612,0 diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json deleted file mode 100644 index 186f65ba8cd682db6c32d324eeb7ee72a65c9dd8..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.014876872027456734 - }, - "anli_r2": { - "acc": 0.328, - "acc_stderr": 0.014853842487270334 - }, - "anli_r3": { - "acc": 0.3258333333333333, - "acc_stderr": 0.013535422043417464 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.0672477765493766, - "f1": 0.37360259646592137 - }, - "copa": { - "acc": 0.65, - "acc_stderr": 0.0479372485441102 - }, - "hellaswag": { - "acc": 0.37880900219079866, - "acc_stderr": 0.004840990593494679, - "acc_norm": 0.45249950209121687, - "acc_norm_stderr": 0.004967213515483206 - }, - "rte": { - "acc": 0.44404332129963897, - "acc_stderr": 0.029907396333795987 - }, - "winogrande": { - "acc": 0.5288082083662194, - "acc_stderr": 0.014029141615909612 - }, - "storycloze_2016": { - "acc": 0.640833778727953, - "acc_stderr": 0.011094293150908333 - }, - "boolq": { - "acc": 0.47522935779816516, - "acc_stderr": 0.008734316719387784 - }, - "arc_easy": { - "acc": 0.46254208754208753, - "acc_stderr": 0.010230952104570803, - "acc_norm": 0.42003367003367004, - "acc_norm_stderr": 0.010127718838529405 - }, - "arc_challenge": { - "acc": 0.22440273037542663, - "acc_stderr": 0.012191404938603836, - "acc_norm": 0.2508532423208191, - "acc_norm_stderr": 0.01266819862131543 - }, - "sciq": { - "acc": 0.69, - "acc_stderr": 0.014632638658632896, - "acc_norm": 0.653, - "acc_norm_stderr": 0.015060472031706617 - }, - "piqa": { - "acc": 0.6811751904243744, - "acc_stderr": 0.010873037534333418, - "acc_norm": 0.6936887921653971, - "acc_norm_stderr": 0.010754970032367323 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_2.csv b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..8df420744a85039108d7b25b55cc02ea6d368c6c --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.351,0.015100563798316403,0 +anli_r2,acc,0.347,0.015060472031706618,0 +anli_r3,acc,0.32916666666666666,0.013570806258433626,0 +arc_challenge,acc,0.21928327645051193,0.012091245787615727,0 +arc_challenge,acc_norm,0.24829351535836178,0.012624912868089762,0 +arc_easy,acc,0.4524410774410774,0.010213265860171399,0 +arc_easy,acc_norm,0.4356060606060606,0.010174341733665219,0 +boolq,acc,0.4761467889908257,0.008735097860690572,1 +cb,acc,0.5178571428571429,0.06737697508644647,1 +cb,f1,0.347985347985348,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.38239394542919736,0.004849788423944362,0 +hellaswag,acc_norm,0.45947022505477,0.004973361339169645,0 +piqa,acc,0.6882480957562568,0.010807431424873677,0 +piqa,acc_norm,0.6822633297062024,0.010863133246569286,0 +rte,acc,0.5018050541516246,0.030096267148976633,0 +sciq,acc,0.708,0.014385511563477347,0 +sciq,acc_norm,0.682,0.014734079309311901,0 +storycloze_2016,acc,0.640833778727953,0.01109429315090833,0 +winogrande,acc,0.5288082083662194,0.014029141615909617,0 diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json deleted file mode 100644 index fa5bec6b55632309239a08a73a245e0bcd938ae7..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_2_lm-eval_global_step52452_2023-02-11-12-41-27_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316403 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706618 - }, - "anli_r3": { - "acc": 0.32916666666666666, - "acc_stderr": 0.013570806258433626 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644647, - "f1": 0.347985347985348 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - }, - "hellaswag": { - "acc": 0.38239394542919736, - "acc_stderr": 0.004849788423944362, - "acc_norm": 0.45947022505477, - "acc_norm_stderr": 0.004973361339169645 - }, - "rte": { - "acc": 0.5018050541516246, - "acc_stderr": 0.030096267148976633 - }, - "winogrande": { - "acc": 0.5288082083662194, - "acc_stderr": 0.014029141615909617 - }, - "storycloze_2016": { - "acc": 0.640833778727953, - "acc_stderr": 0.01109429315090833 - }, - "boolq": { - "acc": 0.4761467889908257, - "acc_stderr": 0.008735097860690572 - }, - "arc_easy": { - "acc": 0.4524410774410774, - "acc_stderr": 0.010213265860171399, - "acc_norm": 0.4356060606060606, - "acc_norm_stderr": 0.010174341733665219 - }, - "arc_challenge": { - "acc": 0.21928327645051193, - "acc_stderr": 0.012091245787615727, - "acc_norm": 0.24829351535836178, - "acc_norm_stderr": 0.012624912868089762 - }, - "sciq": { - "acc": 0.708, - "acc_stderr": 0.014385511563477347, - "acc_norm": 0.682, - "acc_norm_stderr": 0.014734079309311901 - }, - "piqa": { - "acc": 0.6882480957562568, - "acc_stderr": 0.010807431424873677, - "acc_norm": 0.6822633297062024, - "acc_norm_stderr": 0.010863133246569286 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_3.csv b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..8d6ae98870876ee5dab73296a7ccbf1564eef3d1 --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.31,0.014632638658632896,0 +anli_r2,acc,0.326,0.014830507204541031,0 +anli_r3,acc,0.3283333333333333,0.013562032919529024,0 +arc_challenge,acc,0.23122866894197952,0.012320858834772283,0 +arc_challenge,acc_norm,0.25597269624573377,0.012753013241244525,0 +arc_easy,acc,0.44991582491582494,0.010208181969301794,0 +arc_easy,acc_norm,0.4351851851851852,0.010173216430370923,0 +boolq,acc,0.4709480122324159,0.008730280528451546,1 +cb,acc,0.5178571428571429,0.06737697508644648,1 +cb,f1,0.352851128970532,,1 +copa,acc,0.68,0.04688261722621504,0 +hellaswag,acc,0.3783110934076877,0.004839746491523509,0 +hellaswag,acc_norm,0.4525990838478391,0.0049673082544257575,0 +piqa,acc,0.6730141458106638,0.010945157126978224,0 +piqa,acc_norm,0.676278563656148,0.010916765010708762,0 +rte,acc,0.4981949458483754,0.030096267148976626,0 +sciq,acc,0.712,0.01432694179723156,0 +sciq,acc_norm,0.7,0.014498627873361421,0 +storycloze_2016,acc,0.6445750935328701,0.011068528452399879,0 +winogrande,acc,0.5146014206787688,0.014046492383275834,0 diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json deleted file mode 100644 index 4f7c269d6276b0ccee007184c19dbbafe513102e..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_3_lm-eval_global_step52452_2023-02-11-12-41-26_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.31, - "acc_stderr": 0.014632638658632896 - }, - "anli_r2": { - "acc": 0.326, - "acc_stderr": 0.014830507204541031 - }, - "anli_r3": { - "acc": 0.3283333333333333, - "acc_stderr": 0.013562032919529024 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644648, - "f1": 0.352851128970532 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621504 - }, - "hellaswag": { - "acc": 0.3783110934076877, - "acc_stderr": 0.004839746491523509, - "acc_norm": 0.4525990838478391, - "acc_norm_stderr": 0.0049673082544257575 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.5146014206787688, - "acc_stderr": 0.014046492383275834 - }, - "storycloze_2016": { - "acc": 0.6445750935328701, - "acc_stderr": 0.011068528452399879 - }, - "boolq": { - "acc": 0.4709480122324159, - "acc_stderr": 0.008730280528451546 - }, - "arc_easy": { - "acc": 0.44991582491582494, - "acc_stderr": 0.010208181969301794, - "acc_norm": 0.4351851851851852, - "acc_norm_stderr": 0.010173216430370923 - }, - "arc_challenge": { - "acc": 0.23122866894197952, - "acc_stderr": 0.012320858834772283, - "acc_norm": 0.25597269624573377, - "acc_norm_stderr": 0.012753013241244525 - }, - "sciq": { - "acc": 0.712, - "acc_stderr": 0.01432694179723156, - "acc_norm": 0.7, - "acc_norm_stderr": 0.014498627873361421 - }, - "piqa": { - "acc": 0.6730141458106638, - "acc_stderr": 0.010945157126978224, - "acc_norm": 0.676278563656148, - "acc_norm_stderr": 0.010916765010708762 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_4.csv b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..160871a1a63380c4b98e8caf51d17c9ed4bcdc07 --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.338,0.014965960710224472,0 +anli_r3,acc,0.32666666666666666,0.013544340907003663,0 +arc_challenge,acc,0.2295221843003413,0.012288926760890778,0 +arc_challenge,acc_norm,0.24914675767918087,0.012639407111926433,0 +arc_easy,acc,0.4524410774410774,0.010213265860171399,0 +arc_easy,acc_norm,0.4276094276094276,0.010151683397430687,0 +boolq,acc,0.4620795107033639,0.00871986856715964,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.28801528801528803,,1 +copa,acc,0.65,0.0479372485441102,0 +hellaswag,acc,0.37860983867755427,0.004840493603166199,0 +hellaswag,acc_norm,0.45976897032463654,0.004973602904247794,0 +piqa,acc,0.6795429815016322,0.010887766073814874,0 +piqa,acc_norm,0.6741022850924918,0.010935760218903948,0 +rte,acc,0.5018050541516246,0.030096267148976633,0 +sciq,acc,0.713,0.014312087053809963,0 +sciq,acc_norm,0.69,0.0146326386586329,0 +storycloze_2016,acc,0.6317477284874399,0.011153823258531747,0 +winogrande,acc,0.5240726124704025,0.014036189665395134,0 diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_4_lm-eval_global_step52452_2023-02-11-12-41-26_4shots_backup.json b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_4_lm-eval_global_step52452_2023-02-11-12-41-26_4shots_backup.json deleted file mode 100644 index decb8b843ec4260df1ab22593a5e2ec4b81ba4d8..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_4_lm-eval_global_step52452_2023-02-11-12-41-26_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224472 - }, - "anli_r3": { - "acc": 0.32666666666666666, - "acc_stderr": 0.013544340907003663 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.28801528801528803 - }, - "copa": { - "acc": 0.65, - "acc_stderr": 0.0479372485441102 - }, - "hellaswag": { - "acc": 0.37860983867755427, - "acc_stderr": 0.004840493603166199, - "acc_norm": 0.45976897032463654, - "acc_norm_stderr": 0.004973602904247794 - }, - "rte": { - "acc": 0.5018050541516246, - "acc_stderr": 0.030096267148976633 - }, - "winogrande": { - "acc": 0.5240726124704025, - "acc_stderr": 0.014036189665395134 - }, - "storycloze_2016": { - "acc": 0.6317477284874399, - "acc_stderr": 0.011153823258531747 - }, - "boolq": { - "acc": 0.4620795107033639, - "acc_stderr": 0.00871986856715964 - }, - "arc_easy": { - "acc": 0.4524410774410774, - "acc_stderr": 0.010213265860171399, - "acc_norm": 0.4276094276094276, - "acc_norm_stderr": 0.010151683397430687 - }, - "arc_challenge": { - "acc": 0.2295221843003413, - "acc_stderr": 0.012288926760890778, - "acc_norm": 0.24914675767918087, - "acc_norm_stderr": 0.012639407111926433 - }, - "sciq": { - "acc": 0.713, - "acc_stderr": 0.014312087053809963, - "acc_norm": 0.69, - "acc_norm_stderr": 0.0146326386586329 - }, - "piqa": { - "acc": 0.6795429815016322, - "acc_stderr": 0.010887766073814874, - "acc_norm": 0.6741022850924918, - "acc_norm_stderr": 0.010935760218903948 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_5.csv b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..33affa9bad66b030053522b658dc16493396a1dc --- /dev/null +++ b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.014888272588203936,0 +anli_r2,acc,0.325,0.014818724459095524,0 +anli_r3,acc,0.345,0.013728421539454878,0 +arc_challenge,acc,0.2380546075085324,0.012445770028026208,0 +arc_challenge,acc_norm,0.2525597269624573,0.012696728980207706,0 +arc_easy,acc,0.44191919191919193,0.010190328123071765,0 +arc_easy,acc_norm,0.4335016835016835,0.010168640625454115,0 +boolq,acc,0.4617737003058104,0.00871946009810685,1 +cb,acc,0.5178571428571429,0.06737697508644647,1 +cb,f1,0.2925863044708545,,1 +copa,acc,0.67,0.047258156262526066,0 +hellaswag,acc,0.3778131846245768,0.0048384969668239025,0 +hellaswag,acc_norm,0.45578570005974905,0.0049702340327283006,0 +piqa,acc,0.6877040261153428,0.010812581599154424,0 +piqa,acc_norm,0.6849836779107725,0.010838072746240652,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.726,0.014111099288259585,0 +sciq,acc_norm,0.692,0.014606483127342761,0 +storycloze_2016,acc,0.638161411010155,0.011112247531047545,0 +winogrande,acc,0.5303867403314917,0.01402651083942874,0 diff --git a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_5_lm-eval_global_step52452_2023-02-11-12-41-26_5shots_backup.json b/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_5_lm-eval_global_step52452_2023-02-11-12-41-26_5shots_backup.json deleted file mode 100644 index 7b101130ae0e49dd93e87109f70ef96193feebd7..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed3/evaluation/rankeval/2b855b1b25c4seed3_5_lm-eval_global_step52452_2023-02-11-12-41-26_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203936 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095524 - }, - "anli_r3": { - "acc": 0.345, - "acc_stderr": 0.013728421539454878 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644647, - "f1": 0.2925863044708545 - }, - "copa": { - "acc": 0.67, - "acc_stderr": 0.047258156262526066 - }, - "hellaswag": { - "acc": 0.3778131846245768, - "acc_stderr": 0.0048384969668239025, - "acc_norm": 0.45578570005974905, - "acc_norm_stderr": 0.0049702340327283006 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5303867403314917, - "acc_stderr": 0.01402651083942874 - }, - "storycloze_2016": { - "acc": 0.638161411010155, - "acc_stderr": 0.011112247531047545 - }, - "boolq": { - "acc": 0.4617737003058104, - "acc_stderr": 0.00871946009810685 - }, - "arc_easy": { - "acc": 0.44191919191919193, - "acc_stderr": 0.010190328123071765, - "acc_norm": 0.4335016835016835, - "acc_norm_stderr": 0.010168640625454115 - }, - "arc_challenge": { - "acc": 0.2380546075085324, - "acc_stderr": 0.012445770028026208, - "acc_norm": 0.2525597269624573, - "acc_norm_stderr": 0.012696728980207706 - }, - "sciq": { - "acc": 0.726, - "acc_stderr": 0.014111099288259585, - "acc_norm": 0.692, - "acc_norm_stderr": 0.014606483127342761 - }, - "piqa": { - "acc": 0.6877040261153428, - "acc_stderr": 0.010812581599154424, - "acc_norm": 0.6849836779107725, - "acc_norm_stderr": 0.010838072746240652 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/generation/merged.csv b/2b855b1b25c4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..e062066fd383d100d029e31cf7a983d7df4dc7cb --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.002908975138220329 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.002908975138220329 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.10381155313855887 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.10381155313855887 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1139949279976248 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1139949279976248 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.11696994172728126 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.11696994172728126 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.11758078614483855 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.11758078614483855 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.11693768290607841 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.11693768290607841 +e2e_nlg_cleaned,5,average,multiple,0.0953673111754337 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.03222831347895068 +gem_xsum,0,median,rouge2_fmeasure,0.03222831347895068 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.021368328865681116 +gem_xsum,1,median,rouge2_fmeasure,0.021368328865681116 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.01942353387393521 +gem_xsum,2,median,rouge2_fmeasure,0.01942353387393521 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.018553764265445295 +gem_xsum,3,median,rouge2_fmeasure,0.018553764265445295 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.004421681388038414 +gem_xsum,4,median,rouge2_fmeasure,0.004421681388038414 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00018162755947646124 +gem_xsum,5,median,rouge2_fmeasure,0.00018162755947646124 +gem_xsum,5,average,multiple,0.016029541571921196 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04014126313147003 +web_nlg_en,0,median,rouge2_fmeasure,0.04014126313147003 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.030282421503935477 +web_nlg_en,1,median,rouge2_fmeasure,0.030282421503935477 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.030178162063438194 +web_nlg_en,2,median,rouge2_fmeasure,0.030178162063438194 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.030257065984893673 +web_nlg_en,3,median,rouge2_fmeasure,0.030257065984893673 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.03273432950647903 +web_nlg_en,4,median,rouge2_fmeasure,0.03273432950647903 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.03173794735917154 +web_nlg_en,5,median,rouge2_fmeasure,0.03173794735917154 +web_nlg_en,5,average,multiple,0.032555198258231326 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.02574727220612488 +wiki_lingua_en,0,median,rouge2_fmeasure,0.02574727220612488 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.02701999493867571 +wiki_lingua_en,1,median,rouge2_fmeasure,0.02701999493867571 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.027716761017832728 +wiki_lingua_en,2,median,rouge2_fmeasure,0.027716761017832728 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.023695385083786366 +wiki_lingua_en,3,median,rouge2_fmeasure,0.023695385083786366 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.007893612316120567 +wiki_lingua_en,4,median,rouge2_fmeasure,0.007893612316120567 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0012442077120205612 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0012442077120205612 +wiki_lingua_en,5,average,multiple,0.018886205545760135 diff --git a/2b855b1b25c4seed4/evaluation/generation/merged.json b/2b855b1b25c4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..638f758050a7d623e40c996d5f6d7a45161c7378 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.41338250912807245, "bleu_stderr": 0.03777471353872479, "rouge1_fmeasure": 0.09569634315788397, "rouge1_fmeasure_stderr": 0.0019140901485651678, "rouge1_precision": 0.0738054036631914, "rouge1_precision_stderr": 0.002547278697760544, "rouge1_recall": 0.2769917944239361, "rouge1_recall_stderr": 0.004676528441032924, "rouge2_fmeasure": 0.04014126313147003, "rouge2_fmeasure_stderr": 0.001202556938499064, "rouge2_precision": 0.029996640324883733, "rouge2_precision_stderr": 0.001524247939763139, "rouge2_recall": 0.1194386549334523, "rouge2_recall_stderr": 0.0029977146212651776, "rougeL_fmeasure": 0.08707414063184835, "rougeL_fmeasure_stderr": 0.0017058075998820782, "rougeL_precision": 0.06696274119904323, "rougeL_precision_stderr": 0.002315394983643, "rougeL_recall": 0.25617393481632134, "rougeL_recall_stderr": 0.004408585848417626, "rougeLsum_fmeasure": 0.08835595022129512, "rougeLsum_fmeasure_stderr": 0.0017544947903355987, "rougeLsum_precision": 0.06827907294852234, "rougeLsum_precision_stderr": 0.002368743034642765, "rougeLsum_recall": 0.25634433420900943, "rougeLsum_recall_stderr": 0.0042794224603852965}}, "1": {"PALM_prompt": {"bleu": 0.3929871678506059, "bleu_stderr": 0.027113502088038446, "rouge1_fmeasure": 0.08306012732859218, "rouge1_fmeasure_stderr": 0.0016238408660472071, "rouge1_precision": 0.05691627660396575, "rouge1_precision_stderr": 0.0017970175809284821, "rouge1_recall": 0.2800397030567386, "rouge1_recall_stderr": 0.004596190687481955, "rouge2_fmeasure": 0.030282421503935477, "rouge2_fmeasure_stderr": 0.0009381068007328495, "rouge2_precision": 0.021374684526603238, "rouge2_precision_stderr": 0.0011437787669306358, "rouge2_recall": 0.10578708854611155, "rouge2_recall_stderr": 0.002935401548872945, "rougeL_fmeasure": 0.0724927363968997, "rougeL_fmeasure_stderr": 0.001386257242458974, "rougeL_precision": 0.04954161129365949, "rougeL_precision_stderr": 0.0015865464719097362, "rougeL_recall": 0.2483104267940398, "rougeL_recall_stderr": 0.004108925013149585, "rougeLsum_fmeasure": 0.07598928569827458, "rougeLsum_fmeasure_stderr": 0.0014789102455151728, "rougeLsum_precision": 0.052205834315103006, "rougeLsum_precision_stderr": 0.00167829934652265, "rougeLsum_recall": 0.2572982859437576, "rougeLsum_recall_stderr": 0.004214311358132706}}, "2": {"PALM_prompt": {"bleu": 0.3818361462497907, "bleu_stderr": 0.02155928570036411, "rouge1_fmeasure": 0.08337510038261521, "rouge1_fmeasure_stderr": 0.0014003884326804653, "rouge1_precision": 0.05471172904471613, "rouge1_precision_stderr": 0.0014091432862995145, "rouge1_recall": 0.29846179395150235, "rouge1_recall_stderr": 0.00460078147308108, "rouge2_fmeasure": 0.030178162063438194, "rouge2_fmeasure_stderr": 0.0008363786012061165, "rouge2_precision": 0.019792460115797387, "rouge2_precision_stderr": 0.0008611865471653958, "rouge2_recall": 0.11650190651765796, "rouge2_recall_stderr": 0.0030990055062325695, "rougeL_fmeasure": 0.07208323808371675, "rougeL_fmeasure_stderr": 0.0012041580525133998, "rougeL_precision": 0.047419413416286055, "rougeL_precision_stderr": 0.0012872447031911885, "rougeL_recall": 0.2607372459471555, "rougeL_recall_stderr": 0.004085463623376844, "rougeLsum_fmeasure": 0.07642268302285708, "rougeLsum_fmeasure_stderr": 0.0012969083304177709, "rougeLsum_precision": 0.05023967957795977, "rougeLsum_precision_stderr": 0.0013391404654021308, "rougeLsum_recall": 0.2744128922463123, "rougeLsum_recall_stderr": 0.004244489606651636}}, "3": {"PALM_prompt": {"bleu": 0.38107906362162436, "bleu_stderr": 0.024991969242556415, "rouge1_fmeasure": 0.08419045686370444, "rouge1_fmeasure_stderr": 0.0014216699878185407, "rouge1_precision": 0.054415098047743866, "rouge1_precision_stderr": 0.0013107766774520463, "rouge1_recall": 0.30162861434179955, "rouge1_recall_stderr": 0.004713531273257677, "rouge2_fmeasure": 0.030257065984893673, "rouge2_fmeasure_stderr": 0.0008278096587740237, "rouge2_precision": 0.0199239942581258, "rouge2_precision_stderr": 0.0008735683263287026, "rouge2_recall": 0.1163286060538195, "rouge2_recall_stderr": 0.003027064382318236, "rougeL_fmeasure": 0.07233738236760087, "rougeL_fmeasure_stderr": 0.0012044974617501706, "rougeL_precision": 0.04673499208005788, "rougeL_precision_stderr": 0.0011514714293288978, "rougeL_recall": 0.26145337399109125, "rougeL_recall_stderr": 0.0040828368504070775, "rougeLsum_fmeasure": 0.07685042082570372, "rougeLsum_fmeasure_stderr": 0.0012953603839685258, "rougeLsum_precision": 0.04978601388868997, "rougeLsum_precision_stderr": 0.0012270851126650545, "rougeLsum_recall": 0.27583792528118956, "rougeLsum_recall_stderr": 0.00430618213345443}}, "4": {"PALM_prompt": {"bleu": 0.4331818134565809, "bleu_stderr": 0.025708925934709085, "rouge1_fmeasure": 0.08824735281130963, "rouge1_fmeasure_stderr": 0.0015056089918678895, "rouge1_precision": 0.05682977063832145, "rouge1_precision_stderr": 0.0013062418403193361, "rouge1_recall": 0.30963702160940254, "rouge1_recall_stderr": 0.004616297279931747, "rouge2_fmeasure": 0.03273432950647903, "rouge2_fmeasure_stderr": 0.0008786271694840897, "rouge2_precision": 0.02067339199582709, "rouge2_precision_stderr": 0.0006211781895994543, "rouge2_recall": 0.12114251297994481, "rouge2_recall_stderr": 0.0029719865408334013, "rougeL_fmeasure": 0.07481926067401642, "rougeL_fmeasure_stderr": 0.001255481182168781, "rougeL_precision": 0.048186112796526646, "rougeL_precision_stderr": 0.0011267275151557667, "rougeL_recall": 0.26515033109253616, "rougeL_recall_stderr": 0.003957808650334678, "rougeLsum_fmeasure": 0.0804810777336425, "rougeLsum_fmeasure_stderr": 0.0013743352875061586, "rougeLsum_precision": 0.05191748358550353, "rougeLsum_precision_stderr": 0.0012160686690536905, "rougeLsum_recall": 0.282558120078832, "rougeLsum_recall_stderr": 0.004184430332361501}}, "5": {"PALM_prompt": {"bleu": 0.4312342963741201, "bleu_stderr": 0.023463474792083883, "rouge1_fmeasure": 0.08698934245417853, "rouge1_fmeasure_stderr": 0.0014447952844142498, "rouge1_precision": 0.05607921390768092, "rouge1_precision_stderr": 0.00129031016961261, "rouge1_recall": 0.3085676515081395, "rouge1_recall_stderr": 0.0048221395339639395, "rouge2_fmeasure": 0.03173794735917154, "rouge2_fmeasure_stderr": 0.0008250062200069443, "rouge2_precision": 0.020422249942234363, "rouge2_precision_stderr": 0.0006979964780737806, "rouge2_recall": 0.12130703535138149, "rouge2_recall_stderr": 0.0029860752254612147, "rougeL_fmeasure": 0.07432779731546756, "rougeL_fmeasure_stderr": 0.001208657695694648, "rougeL_precision": 0.04788258684945666, "rougeL_precision_stderr": 0.001108630006404901, "rougeL_recall": 0.26586011232142376, "rougeL_recall_stderr": 0.00411799813737337, "rougeLsum_fmeasure": 0.07929139476822122, "rougeLsum_fmeasure_stderr": 0.0013133571132340437, "rougeLsum_precision": 0.051155183077276144, "rougeLsum_precision_stderr": 0.001198596544003982, "rougeLsum_recall": 0.2815842896389388, "rougeLsum_recall_stderr": 0.0043688508519031674}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.0764459956048447, "bleu_stderr": 0.05327837785064096, "rouge1_fmeasure": 0.16144716883961532, "rouge1_fmeasure_stderr": 0.0017539528721051748, "rouge1_precision": 0.14589422295372542, "rouge1_precision_stderr": 0.001933223614546612, "rouge1_recall": 0.22498179131269405, "rouge1_recall_stderr": 0.002425048789322509, "rouge2_fmeasure": 0.02574727220612488, "rouge2_fmeasure_stderr": 0.0006915295697930535, "rouge2_precision": 0.02286378356102028, "rouge2_precision_stderr": 0.0006479174463346086, "rouge2_recall": 0.03652991399443741, "rouge2_recall_stderr": 0.0010372486460884672, "rougeL_fmeasure": 0.12418496671845956, "rougeL_fmeasure_stderr": 0.0012505052342254785, "rougeL_precision": 0.11153515740515704, "rougeL_precision_stderr": 0.0013946384289124252, "rougeL_recall": 0.17653848259138974, "rougeL_recall_stderr": 0.0019101736726851136, "rougeLsum_fmeasure": 0.15086987639868266, "rougeLsum_fmeasure_stderr": 0.001627431834107335, "rougeLsum_precision": 0.13604617460758534, "rougeLsum_precision_stderr": 0.0017813911055273796, "rougeLsum_recall": 0.21098478246265154, "rougeLsum_recall_stderr": 0.002275378121859843}}, "1": {"tldr_en": {"bleu": 1.2141791330973613, "bleu_stderr": 0.04697019036643873, "rouge1_fmeasure": 0.16954134084619948, "rouge1_fmeasure_stderr": 0.001706665501240968, "rouge1_precision": 0.14685621763710516, "rouge1_precision_stderr": 0.0018020908876800418, "rouge1_recall": 0.24288741031053673, "rouge1_recall_stderr": 0.0023907512457460364, "rouge2_fmeasure": 0.02701999493867571, "rouge2_fmeasure_stderr": 0.0007146700305296079, "rouge2_precision": 0.023349809665081732, "rouge2_precision_stderr": 0.0006355394861618253, "rouge2_recall": 0.03883170228748695, "rouge2_recall_stderr": 0.0011313425448321853, "rougeL_fmeasure": 0.1234021038771501, "rougeL_fmeasure_stderr": 0.0011464127230771072, "rougeL_precision": 0.10574797190866989, "rougeL_precision_stderr": 0.0011973517445978304, "rougeL_recall": 0.18118744215711277, "rougeL_recall_stderr": 0.0018103543901955745, "rougeLsum_fmeasure": 0.1602464043520148, "rougeLsum_fmeasure_stderr": 0.0016072662223249824, "rougeLsum_precision": 0.13879863182886407, "rougeLsum_precision_stderr": 0.0017014251413245724, "rougeLsum_recall": 0.2298440453442261, "rougeLsum_recall_stderr": 0.0022564218187161116}}, "2": {"tldr_en": {"bleu": 1.3103535658863046, "bleu_stderr": 0.06703555679096626, "rouge1_fmeasure": 0.173699519351746, "rouge1_fmeasure_stderr": 0.0017214200105041546, "rouge1_precision": 0.15031636274490512, "rouge1_precision_stderr": 0.001859043608657752, "rouge1_recall": 0.24880630967473474, "rouge1_recall_stderr": 0.002330740982778701, "rouge2_fmeasure": 0.027716761017832728, "rouge2_fmeasure_stderr": 0.0007095562395488355, "rouge2_precision": 0.024162595576083717, "rouge2_precision_stderr": 0.000670917137851054, "rouge2_recall": 0.03980552616737724, "rouge2_recall_stderr": 0.0011089835034636026, "rougeL_fmeasure": 0.12687990534486876, "rougeL_fmeasure_stderr": 0.0011599894493513059, "rougeL_precision": 0.10847967070055685, "rougeL_precision_stderr": 0.0012390469014129546, "rougeL_recall": 0.18664977766855087, "rougeL_recall_stderr": 0.001808642652682323, "rougeLsum_fmeasure": 0.16421612208598274, "rougeLsum_fmeasure_stderr": 0.0016135741695381676, "rougeLsum_precision": 0.1419859602079781, "rougeLsum_precision_stderr": 0.0017437945295697102, "rougeLsum_recall": 0.23587212383147646, "rougeLsum_recall_stderr": 0.00221789102407127}}, "3": {"tldr_en": {"bleu": 1.3739899059482077, "bleu_stderr": 0.054100401167793986, "rouge1_fmeasure": 0.14662126517098895, "rouge1_fmeasure_stderr": 0.0019333709539952743, "rouge1_precision": 0.1295595146847216, "rouge1_precision_stderr": 0.0019896357220441216, "rouge1_recall": 0.2102162328614198, "rouge1_recall_stderr": 0.00272074003902666, "rouge2_fmeasure": 0.023695385083786366, "rouge2_fmeasure_stderr": 0.0006858655898430962, "rouge2_precision": 0.02077488813069694, "rouge2_precision_stderr": 0.0006455414489339597, "rouge2_recall": 0.03482090895899887, "rouge2_recall_stderr": 0.0011168543506774758, "rougeL_fmeasure": 0.10749243853435972, "rougeL_fmeasure_stderr": 0.0013482432392331297, "rougeL_precision": 0.09414740497672094, "rougeL_precision_stderr": 0.0013816532310767095, "rougeL_recall": 0.15831582272893086, "rougeL_recall_stderr": 0.002114859561711096, "rougeLsum_fmeasure": 0.13858801649091676, "rougeLsum_fmeasure_stderr": 0.0018186012574435816, "rougeLsum_precision": 0.12237496223902421, "rougeLsum_precision_stderr": 0.0018788789863744247, "rougeLsum_recall": 0.19935578071938537, "rougeLsum_recall_stderr": 0.002589060276145499}}, "4": {"tldr_en": {"bleu": 0.265475079518499, "bleu_stderr": 0.030804609116683257, "rouge1_fmeasure": 0.04868744260399047, "rouge1_fmeasure_stderr": 0.0016566640859273175, "rouge1_precision": 0.0448748212472036, "rouge1_precision_stderr": 0.001674215380433797, "rouge1_recall": 0.07210403360248313, "rouge1_recall_stderr": 0.002478770057973383, "rouge2_fmeasure": 0.007893612316120567, "rouge2_fmeasure_stderr": 0.0004304882485780855, "rouge2_precision": 0.007076838117289068, "rouge2_precision_stderr": 0.0004075719557853454, "rouge2_recall": 0.012100463711250292, "rouge2_recall_stderr": 0.0007346064920215873, "rougeL_fmeasure": 0.03632389232899611, "rougeL_fmeasure_stderr": 0.001208391229822389, "rougeL_precision": 0.033497722351980966, "rougeL_precision_stderr": 0.0012411289314971272, "rougeL_recall": 0.05488496968515189, "rougeL_recall_stderr": 0.0018940926518647451, "rougeLsum_fmeasure": 0.04574337516533684, "rougeLsum_fmeasure_stderr": 0.0015497628634815858, "rougeLsum_precision": 0.04213647007745186, "rougeLsum_precision_stderr": 0.0015665560018835252, "rougeLsum_recall": 0.06802126946110762, "rougeLsum_recall_stderr": 0.0023435441607120573}}, "5": {"tldr_en": {"bleu": 4.713742098090827e-07, "bleu_stderr": 8.231428916056322e-07, "rouge1_fmeasure": 0.00756286892924481, "rouge1_fmeasure_stderr": 0.0007193404889854571, "rouge1_precision": 0.007171429338173016, "rouge1_precision_stderr": 0.0007330884682225905, "rouge1_recall": 0.011175873837651928, "rouge1_recall_stderr": 0.001074090847329161, "rouge2_fmeasure": 0.0012442077120205612, "rouge2_fmeasure_stderr": 0.0001840587305723463, "rouge2_precision": 0.0011688760279436425, "rouge2_precision_stderr": 0.0001851681315162851, "rouge2_recall": 0.0019735899949440455, "rouge2_recall_stderr": 0.00033046579444616326, "rougeL_fmeasure": 0.0057573990953003916, "rougeL_fmeasure_stderr": 0.0005453472217137533, "rougeL_precision": 0.005420689822502483, "rougeL_precision_stderr": 0.0005525794775771863, "rougeL_recall": 0.00874074158624237, "rougeL_recall_stderr": 0.0008628947049986887, "rougeLsum_fmeasure": 0.007164811847137944, "rougeLsum_fmeasure_stderr": 0.0006831070127182647, "rougeLsum_precision": 0.006766801959955348, "rougeLsum_precision_stderr": 0.0006944542821860396, "rougeLsum_recall": 0.010657190953912812, "rougeLsum_recall_stderr": 0.00103038009829335}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.036606993302532996, "bleu_stderr": 0.01113301029336236, "rouge1_fmeasure": 0.02458382755004403, "rouge1_fmeasure_stderr": 0.0010254012766460537, "rouge1_precision": 0.03183389849383117, "rouge1_precision_stderr": 0.0015589242812464792, "rouge1_recall": 0.02948861020047669, "rouge1_recall_stderr": 0.0013144174127028803, "rouge2_fmeasure": 0.002908975138220329, "rouge2_fmeasure_stderr": 0.00031565035988718033, "rouge2_precision": 0.004485319895648124, "rouge2_precision_stderr": 0.0006816305933273163, "rouge2_recall": 0.0036961481248359044, "rouge2_recall_stderr": 0.0003573973074328888, "rougeL_fmeasure": 0.02132102463633844, "rougeL_fmeasure_stderr": 0.0008749150372380813, "rougeL_precision": 0.02731913704467503, "rougeL_precision_stderr": 0.0013507901206105252, "rougeL_recall": 0.026275733553497133, "rougeL_recall_stderr": 0.001183493473226699, "rougeLsum_fmeasure": 0.021324873124249733, "rougeLsum_fmeasure_stderr": 0.0008909548716459579, "rougeLsum_precision": 0.027900086870844505, "rougeLsum_precision_stderr": 0.001395723733359835, "rougeLsum_recall": 0.02553088411607409, "rougeLsum_recall_stderr": 0.00114694369013677}}, "1": {"generate_text_restaurant": {"bleu": 4.678693262126458, "bleu_stderr": 0.06613674533498272, "rouge1_fmeasure": 0.28613774190701086, "rouge1_fmeasure_stderr": 0.002069532514655764, "rouge1_precision": 0.2597098670931843, "rouge1_precision_stderr": 0.0026927171493564064, "rouge1_recall": 0.38988900035382545, "rouge1_recall_stderr": 0.003027637833803093, "rouge2_fmeasure": 0.10381155313855887, "rouge2_fmeasure_stderr": 0.001332832825451982, "rouge2_precision": 0.0964724101139362, "rouge2_precision_stderr": 0.001795173815714425, "rouge2_recall": 0.14384296252423778, "rouge2_recall_stderr": 0.0019178044164720822, "rougeL_fmeasure": 0.20495840809818508, "rougeL_fmeasure_stderr": 0.0014880433666955757, "rougeL_precision": 0.18825917514029195, "rougeL_precision_stderr": 0.002218416803767207, "rougeL_recall": 0.28128434138348024, "rougeL_recall_stderr": 0.0023186024669175154, "rougeLsum_fmeasure": 0.23913411907532356, "rougeLsum_fmeasure_stderr": 0.0018329968151256037, "rougeLsum_precision": 0.2184303899074934, "rougeLsum_precision_stderr": 0.0024598467319383306, "rougeLsum_recall": 0.32617134969099115, "rougeLsum_recall_stderr": 0.0026977834592149114}}, "2": {"generate_text_restaurant": {"bleu": 4.98015134485123, "bleu_stderr": 0.1061276452705762, "rouge1_fmeasure": 0.29329457428793704, "rouge1_fmeasure_stderr": 0.002006934490011619, "rouge1_precision": 0.26019264842639633, "rouge1_precision_stderr": 0.0025553754068955393, "rouge1_recall": 0.40725640152585113, "rouge1_recall_stderr": 0.002901392043264379, "rouge2_fmeasure": 0.1139949279976248, "rouge2_fmeasure_stderr": 0.0013387487193541924, "rouge2_precision": 0.10185500510639955, "rouge2_precision_stderr": 0.0015956215394903554, "rouge2_recall": 0.1617029433014766, "rouge2_recall_stderr": 0.0019581481050293366, "rougeL_fmeasure": 0.2167723194466543, "rougeL_fmeasure_stderr": 0.001499235830757627, "rougeL_precision": 0.1924593116610896, "rougeL_precision_stderr": 0.0019737922363412316, "rougeL_recall": 0.30374736639663047, "rougeL_recall_stderr": 0.0023356738790004934, "rougeLsum_fmeasure": 0.24551196878226098, "rougeLsum_fmeasure_stderr": 0.0018284025722243385, "rougeLsum_precision": 0.2184595008139227, "rougeLsum_precision_stderr": 0.0022920206320671383, "rougeLsum_recall": 0.3409414958760342, "rougeLsum_recall_stderr": 0.0026545706111110235}}, "3": {"generate_text_restaurant": {"bleu": 5.1134618329721855, "bleu_stderr": 0.102874983458279, "rouge1_fmeasure": 0.29572458104375104, "rouge1_fmeasure_stderr": 0.0019652234288593832, "rouge1_precision": 0.2609773741465744, "rouge1_precision_stderr": 0.002537950159211429, "rouge1_recall": 0.4140766245954641, "rouge1_recall_stderr": 0.0028576982494730894, "rouge2_fmeasure": 0.11696994172728126, "rouge2_fmeasure_stderr": 0.001352602075676608, "rouge2_precision": 0.10343343971596863, "rouge2_precision_stderr": 0.0015790066279702149, "rouge2_recall": 0.1672186143762521, "rouge2_recall_stderr": 0.001978175616342694, "rougeL_fmeasure": 0.22247988536181315, "rougeL_fmeasure_stderr": 0.001502714121390009, "rougeL_precision": 0.1960049180295718, "rougeL_precision_stderr": 0.001955654949336797, "rougeL_recall": 0.31480061321089475, "rougeL_recall_stderr": 0.0023792872562502137, "rougeLsum_fmeasure": 0.24885766889305827, "rougeLsum_fmeasure_stderr": 0.0018115549738266202, "rougeLsum_precision": 0.21983398684366603, "rougeLsum_precision_stderr": 0.0022588619429364937, "rougeLsum_recall": 0.34885033177824787, "rougeLsum_recall_stderr": 0.0026626060562648583}}, "4": {"generate_text_restaurant": {"bleu": 5.097936575265735, "bleu_stderr": 0.1163020571573472, "rouge1_fmeasure": 0.2967462841366472, "rouge1_fmeasure_stderr": 0.0019286200261761643, "rouge1_precision": 0.25429729638850906, "rouge1_precision_stderr": 0.0023695679038110594, "rouge1_recall": 0.42161699620712306, "rouge1_recall_stderr": 0.0027872475333441263, "rouge2_fmeasure": 0.11758078614483855, "rouge2_fmeasure_stderr": 0.0013346582963502385, "rouge2_precision": 0.100614248301458, "rouge2_precision_stderr": 0.0014666707461216508, "rouge2_recall": 0.17125754371663027, "rouge2_recall_stderr": 0.001994092888477835, "rougeL_fmeasure": 0.22477013379555477, "rougeL_fmeasure_stderr": 0.001493218841097286, "rougeL_precision": 0.1921733066197307, "rougeL_precision_stderr": 0.0018455693729571333, "rougeL_recall": 0.3224506563390446, "rougeL_recall_stderr": 0.0023276972975120997, "rougeLsum_fmeasure": 0.24865174486699512, "rougeLsum_fmeasure_stderr": 0.00179229504075205, "rougeLsum_precision": 0.21352027571708337, "rougeLsum_precision_stderr": 0.002150915494952749, "rougeLsum_recall": 0.3531480150475261, "rougeLsum_recall_stderr": 0.002562966485962937}}, "5": {"generate_text_restaurant": {"bleu": 5.062788997280982, "bleu_stderr": 0.08822464519858932, "rouge1_fmeasure": 0.29664109112927334, "rouge1_fmeasure_stderr": 0.0019095371610719598, "rouge1_precision": 0.2498449006336628, "rouge1_precision_stderr": 0.002213421916218723, "rouge1_recall": 0.4252530010462427, "rouge1_recall_stderr": 0.0027514974146171283, "rouge2_fmeasure": 0.11693768290607841, "rouge2_fmeasure_stderr": 0.0013038062330121474, "rouge2_precision": 0.0977004338831294, "rouge2_precision_stderr": 0.0013481487988526364, "rouge2_recall": 0.17202446537862454, "rouge2_recall_stderr": 0.001957500600958048, "rougeL_fmeasure": 0.22391656278907518, "rougeL_fmeasure_stderr": 0.0014549429363056052, "rougeL_precision": 0.18806790839367102, "rougeL_precision_stderr": 0.0016881978802737166, "rougeL_recall": 0.3242210598818556, "rougeL_recall_stderr": 0.002295317002498033, "rougeLsum_fmeasure": 0.2494187904059303, "rougeLsum_fmeasure_stderr": 0.0017781290251660277, "rougeLsum_precision": 0.21049812949791483, "rougeLsum_precision_stderr": 0.0020139830412377233, "rougeLsum_recall": 0.35756000644951874, "rougeLsum_recall_stderr": 0.002573449451097643}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0816539670462249, "bleu_stderr": 0.08724954590423005, "rouge1_fmeasure": 0.18405590265991822, "rouge1_fmeasure_stderr": 0.002093935016369007, "rouge1_precision": 0.13397748922052297, "rouge1_precision_stderr": 0.0017221282101016954, "rouge1_recall": 0.31561679156843275, "rouge1_recall_stderr": 0.0036133496348654632, "rouge2_fmeasure": 0.03222831347895068, "rouge2_fmeasure_stderr": 0.0011696062528252086, "rouge2_precision": 0.023321500499379384, "rouge2_precision_stderr": 0.0008839339925666061, "rouge2_recall": 0.05642092599944886, "rouge2_recall_stderr": 0.002042521633667637, "rougeL_fmeasure": 0.13966403343939907, "rougeL_fmeasure_stderr": 0.0015682097025264798, "rougeL_precision": 0.10151652695106225, "rougeL_precision_stderr": 0.001294335208798664, "rougeL_recall": 0.24052070004338394, "rougeL_recall_stderr": 0.0027711563594350164, "rougeLsum_fmeasure": 0.1472192579999028, "rougeLsum_fmeasure_stderr": 0.0017498858887307208, "rougeLsum_precision": 0.1068747708803298, "rougeLsum_precision_stderr": 0.001404213791983037, "rougeLsum_recall": 0.25411251343894836, "rougeLsum_recall_stderr": 0.0031460702347451573}}, "1": {"article_DOC_summary": {"bleu": 0.7350212414590028, "bleu_stderr": 0.10275876779706032, "rouge1_fmeasure": 0.15500067629764053, "rouge1_fmeasure_stderr": 0.0020570253076746124, "rouge1_precision": 0.11006179596471638, "rouge1_precision_stderr": 0.0015414817648688943, "rouge1_recall": 0.27289246422382785, "rouge1_recall_stderr": 0.0034787353666610034, "rouge2_fmeasure": 0.021368328865681116, "rouge2_fmeasure_stderr": 0.0009312648057298122, "rouge2_precision": 0.015088299527624017, "rouge2_precision_stderr": 0.0006600105576374055, "rouge2_recall": 0.03829247963850394, "rouge2_recall_stderr": 0.0017336390271454221, "rougeL_fmeasure": 0.11900981663512405, "rougeL_fmeasure_stderr": 0.0014900097057684348, "rougeL_precision": 0.08432479209074074, "rougeL_precision_stderr": 0.001104941199216902, "rougeL_recall": 0.21093725176366887, "rougeL_recall_stderr": 0.00266374298942473, "rougeLsum_fmeasure": 0.12484888940634106, "rougeLsum_fmeasure_stderr": 0.0016439788546846778, "rougeLsum_precision": 0.08849518379965306, "rougeLsum_precision_stderr": 0.0012207706993141592, "rougeLsum_recall": 0.22106538442254517, "rougeLsum_recall_stderr": 0.002893092987771549}}, "2": {"article_DOC_summary": {"bleu": 0.673385910239856, "bleu_stderr": 0.10534322822067065, "rouge1_fmeasure": 0.15144328529509785, "rouge1_fmeasure_stderr": 0.0019756529400794527, "rouge1_precision": 0.10722514169000213, "rouge1_precision_stderr": 0.0014813014832035488, "rouge1_recall": 0.26855563800035837, "rouge1_recall_stderr": 0.0033368174601433416, "rouge2_fmeasure": 0.01942353387393521, "rouge2_fmeasure_stderr": 0.0008819112840945849, "rouge2_precision": 0.013668659442788302, "rouge2_precision_stderr": 0.0006262412593550217, "rouge2_recall": 0.034981717069404845, "rouge2_recall_stderr": 0.0015923131588300374, "rougeL_fmeasure": 0.11627413938563175, "rougeL_fmeasure_stderr": 0.0014000407494461665, "rougeL_precision": 0.082156295735148, "rougeL_precision_stderr": 0.0010485242556346193, "rougeL_recall": 0.20761435478139378, "rougeL_recall_stderr": 0.002460728110517762, "rougeLsum_fmeasure": 0.12235342750033794, "rougeLsum_fmeasure_stderr": 0.0015792516207881514, "rougeLsum_precision": 0.08641713483693537, "rougeLsum_precision_stderr": 0.0011705501188342233, "rougeLsum_recall": 0.218537718450872, "rougeLsum_recall_stderr": 0.002793898927671036}}, "3": {"article_DOC_summary": {"bleu": 0.7670612295185001, "bleu_stderr": 0.09701074893555275, "rouge1_fmeasure": 0.14387149738404553, "rouge1_fmeasure_stderr": 0.00210303022205832, "rouge1_precision": 0.10413682110658641, "rouge1_precision_stderr": 0.0016784647398663506, "rouge1_recall": 0.2514772461229513, "rouge1_recall_stderr": 0.003650553804321883, "rouge2_fmeasure": 0.018553764265445295, "rouge2_fmeasure_stderr": 0.0009433234829795867, "rouge2_precision": 0.013566692417235222, "rouge2_precision_stderr": 0.000796154080064481, "rouge2_recall": 0.03302434368315226, "rouge2_recall_stderr": 0.001677560755372379, "rougeL_fmeasure": 0.11040492071538352, "rougeL_fmeasure_stderr": 0.0015652279630802186, "rougeL_precision": 0.07994742139541028, "rougeL_precision_stderr": 0.0012884226136490953, "rougeL_recall": 0.19415168515691525, "rougeL_recall_stderr": 0.0028260218859185223, "rougeLsum_fmeasure": 0.11680119161199647, "rougeLsum_fmeasure_stderr": 0.00169045771629767, "rougeLsum_precision": 0.0844817850298531, "rougeLsum_precision_stderr": 0.0013664749055213902, "rougeLsum_recall": 0.2051833918175802, "rougeLsum_recall_stderr": 0.0030204798202167933}}, "4": {"article_DOC_summary": {"bleu": 0.4110074594603615, "bleu_stderr": 0.1524135216676489, "rouge1_fmeasure": 0.03781462774277502, "rouge1_fmeasure_stderr": 0.0021226111468936023, "rouge1_precision": 0.02993988672009351, "rouge1_precision_stderr": 0.001753644635646736, "rouge1_recall": 0.061847859182304815, "rouge1_recall_stderr": 0.0035429410478515643, "rouge2_fmeasure": 0.004421681388038414, "rouge2_fmeasure_stderr": 0.0005244296759673828, "rouge2_precision": 0.0031752446929287676, "rouge2_precision_stderr": 0.0003785048475187664, "rouge2_recall": 0.007793401558568678, "rouge2_recall_stderr": 0.0009379549570744482, "rougeL_fmeasure": 0.02919308751146221, "rougeL_fmeasure_stderr": 0.0016175019079971236, "rougeL_precision": 0.023309073821668826, "rougeL_precision_stderr": 0.0013823605386954466, "rougeL_recall": 0.04794726685498152, "rougeL_recall_stderr": 0.002743427614049344, "rougeLsum_fmeasure": 0.030959481702781236, "rougeLsum_fmeasure_stderr": 0.0017219014413595837, "rougeLsum_precision": 0.024576307863299156, "rougeLsum_precision_stderr": 0.0014454785059669487, "rougeLsum_recall": 0.05093556769434034, "rougeLsum_recall_stderr": 0.002917563906568682}}, "5": {"article_DOC_summary": {"bleu": 3.122667255329593e-39, "bleu_stderr": 1.4163203880943935e-33, "rouge1_fmeasure": 0.0018611525353231337, "rouge1_fmeasure_stderr": 0.000530724648510803, "rouge1_precision": 0.002092981651148016, "rouge1_precision_stderr": 0.0005947327285516237, "rouge1_recall": 0.0017696638624373567, "rouge1_recall_stderr": 0.0005156436824023952, "rouge2_fmeasure": 0.00018162755947646124, "rouge2_fmeasure_stderr": 0.00011585640694731024, "rouge2_precision": 0.00021731875227990654, "rouge2_precision_stderr": 0.00013058292437936588, "rouge2_recall": 0.0001637910738909629, "rouge2_recall_stderr": 0.00011028119608118236, "rougeL_fmeasure": 0.0015640884506811852, "rougeL_fmeasure_stderr": 0.0004448000326863294, "rougeL_precision": 0.0017399440441820257, "rougeL_precision_stderr": 0.00048632447309756205, "rougeL_recall": 0.0015083411463478616, "rougeL_recall_stderr": 0.00044739710011738366, "rougeLsum_fmeasure": 0.0016422883144728158, "rougeLsum_fmeasure_stderr": 0.00046198998652202585, "rougeLsum_precision": 0.0018355316156735634, "rougeLsum_precision_stderr": 0.000513058706207189, "rougeLsum_recall": 0.0015735037329512717, "rougeLsum_recall_stderr": 0.00045676465318422743}}}} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_0.csv b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..61795ba46ba571162eaadd77ae3a6662e81b55c3 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928362,0 +anli_r2,acc,0.355,0.015139491543780529,0 +anli_r3,acc,0.3466666666666667,0.01374402255057195,0 +arc_challenge,acc,0.2150170648464164,0.012005717634133613,0 +arc_challenge,acc_norm,0.2508532423208191,0.012668198621315433,0 +arc_easy,acc,0.48274410774410775,0.01025367167475463,0 +arc_easy,acc_norm,0.4452861952861953,0.010198171137873854,0 +boolq,acc,0.5027522935779817,0.008744922485713836,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.3474500768049156,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.37422824138617805,0.004829339926388335,0 +hellaswag,acc_norm,0.45947022505477,0.004973361339169645,0 +piqa,acc,0.6991294885745375,0.010700745724145973,0 +piqa,acc_norm,0.6942328618063112,0.010749627366141644,0 +rte,acc,0.5415162454873647,0.029992535385373307,0 +sciq,acc,0.678,0.014782913600996667,0 +sciq,acc_norm,0.63,0.015275252316519364,0 +storycloze_2016,acc,0.6557990379476216,0.010986784958746032,0 +winogrande,acc,0.5027624309392266,0.014052271211616441,0 diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_0_lm-eval_global_step52452_2023-02-11-12-41-26_0shots_backup.json b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_0_lm-eval_global_step52452_2023-02-11-12-41-26_0shots_backup.json deleted file mode 100644 index 9290ed1aec0a3a560b78ec1d955ed511a014ff1b..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_0_lm-eval_global_step52452_2023-02-11-12-41-26_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928362 - }, - "anli_r2": { - "acc": 0.355, - "acc_stderr": 0.015139491543780529 - }, - "anli_r3": { - "acc": 0.3466666666666667, - "acc_stderr": 0.01374402255057195 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.3474500768049156 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.37422824138617805, - "acc_stderr": 0.004829339926388335, - "acc_norm": 0.45947022505477, - "acc_norm_stderr": 0.004973361339169645 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373307 - }, - "winogrande": { - "acc": 0.5027624309392266, - "acc_stderr": 0.014052271211616441 - }, - "storycloze_2016": { - "acc": 0.6557990379476216, - "acc_stderr": 0.010986784958746032 - }, - "boolq": { - "acc": 0.5027522935779817, - "acc_stderr": 0.008744922485713836 - }, - "arc_easy": { - "acc": 0.48274410774410775, - "acc_stderr": 0.01025367167475463, - "acc_norm": 0.4452861952861953, - "acc_norm_stderr": 0.010198171137873854 - }, - "arc_challenge": { - "acc": 0.2150170648464164, - "acc_stderr": 0.012005717634133613, - "acc_norm": 0.2508532423208191, - "acc_norm_stderr": 0.012668198621315433 - }, - "sciq": { - "acc": 0.678, - "acc_stderr": 0.014782913600996667, - "acc_norm": 0.63, - "acc_norm_stderr": 0.015275252316519364 - }, - "piqa": { - "acc": 0.6991294885745375, - "acc_stderr": 0.010700745724145973, - "acc_norm": 0.6942328618063112, - "acc_norm_stderr": 0.010749627366141644 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_1.csv b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..9175ed2b796767699c6f20bde6fd9443cd512973 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.01505026612756444,0 +anli_r2,acc,0.358,0.015167928865407557,0 +anli_r3,acc,0.3591666666666667,0.013855141559780364,0 +arc_challenge,acc,0.22696245733788395,0.012240491536132865,0 +arc_challenge,acc_norm,0.24914675767918087,0.012639407111926435,0 +arc_easy,acc,0.468013468013468,0.010238767643185709,0 +arc_easy,acc_norm,0.43813131313131315,0.010180937100600074,0 +boolq,acc,0.4877675840978593,0.008742437504570407,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.426010101010101,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.3725353515236009,0.004824917516374189,0 +hellaswag,acc_norm,0.45598486357299345,0.004970410081009445,0 +piqa,acc,0.6784548422198041,0.010897500107575652,0 +piqa,acc_norm,0.6795429815016322,0.010887766073814888,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.672,0.01485384248727033,0 +sciq,acc_norm,0.658,0.01500870618212173,0 +storycloze_2016,acc,0.6509887760555852,0.011022640519108546,0 +winogrande,acc,0.5303867403314917,0.014026510839428746,0 diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json deleted file mode 100644 index 8d875ef3f79227f994ac184cee2edac210498e15..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_1_lm-eval_global_step52452_2023-02-11-12-41-26_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.01505026612756444 - }, - "anli_r2": { - "acc": 0.358, - "acc_stderr": 0.015167928865407557 - }, - "anli_r3": { - "acc": 0.3591666666666667, - "acc_stderr": 0.013855141559780364 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.426010101010101 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.3725353515236009, - "acc_stderr": 0.004824917516374189, - "acc_norm": 0.45598486357299345, - "acc_norm_stderr": 0.004970410081009445 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5303867403314917, - "acc_stderr": 0.014026510839428746 - }, - "storycloze_2016": { - "acc": 0.6509887760555852, - "acc_stderr": 0.011022640519108546 - }, - "boolq": { - "acc": 0.4877675840978593, - "acc_stderr": 0.008742437504570407 - }, - "arc_easy": { - "acc": 0.468013468013468, - "acc_stderr": 0.010238767643185709, - "acc_norm": 0.43813131313131315, - "acc_norm_stderr": 0.010180937100600074 - }, - "arc_challenge": { - "acc": 0.22696245733788395, - "acc_stderr": 0.012240491536132865, - "acc_norm": 0.24914675767918087, - "acc_norm_stderr": 0.012639407111926435 - }, - "sciq": { - "acc": 0.672, - "acc_stderr": 0.01485384248727033, - "acc_norm": 0.658, - "acc_norm_stderr": 0.01500870618212173 - }, - "piqa": { - "acc": 0.6784548422198041, - "acc_stderr": 0.010897500107575652, - "acc_norm": 0.6795429815016322, - "acc_norm_stderr": 0.010887766073814888 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_2.csv b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..fe8aa7d7fcbb592eae6d16b1f2d97308d71f5c53 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.014806864733738863,0 +anli_r2,acc,0.347,0.01506047203170662,0 +anli_r3,acc,0.345,0.013728421539454869,0 +arc_challenge,acc,0.22184300341296928,0.012141659068147893,0 +arc_challenge,acc_norm,0.25,0.012653835621466646,0 +arc_easy,acc,0.4701178451178451,0.01024144432288643,0 +arc_easy,acc_norm,0.45580808080808083,0.010219631763437853,0 +boolq,acc,0.4932721712538226,0.00874426327382744,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.271358543417367,,1 +copa,acc,0.68,0.04688261722621504,0 +hellaswag,acc,0.3784106751643099,0.004839995745602318,0 +hellaswag,acc_norm,0.4556861183031269,0.0049701457081880104,0 +piqa,acc,0.6882480957562568,0.010807431424873674,0 +piqa,acc_norm,0.6860718171926007,0.010827928134189646,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.68,0.014758652303574886,0 +sciq,acc_norm,0.659,0.0149981313484027,0 +storycloze_2016,acc,0.6445750935328701,0.011068528452399877,0 +winogrande,acc,0.4988161010260458,0.014052446290529015,0 diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_2_lm-eval_global_step52452_2023-02-11-12-41-26_2shots_backup.json b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_2_lm-eval_global_step52452_2023-02-11-12-41-26_2shots_backup.json deleted file mode 100644 index 2ea774b0a8da79412a3d37773e3975f559e07c61..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_2_lm-eval_global_step52452_2023-02-11-12-41-26_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.014806864733738863 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.01506047203170662 - }, - "anli_r3": { - "acc": 0.345, - "acc_stderr": 0.013728421539454869 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.271358543417367 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621504 - }, - "hellaswag": { - "acc": 0.3784106751643099, - "acc_stderr": 0.004839995745602318, - "acc_norm": 0.4556861183031269, - "acc_norm_stderr": 0.0049701457081880104 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.4988161010260458, - "acc_stderr": 0.014052446290529015 - }, - "storycloze_2016": { - "acc": 0.6445750935328701, - "acc_stderr": 0.011068528452399877 - }, - "boolq": { - "acc": 0.4932721712538226, - "acc_stderr": 0.00874426327382744 - }, - "arc_easy": { - "acc": 0.4701178451178451, - "acc_stderr": 0.01024144432288643, - "acc_norm": 0.45580808080808083, - "acc_norm_stderr": 0.010219631763437853 - }, - "arc_challenge": { - "acc": 0.22184300341296928, - "acc_stderr": 0.012141659068147893, - "acc_norm": 0.25, - "acc_norm_stderr": 0.012653835621466646 - }, - "sciq": { - "acc": 0.68, - "acc_stderr": 0.014758652303574886, - "acc_norm": 0.659, - "acc_norm_stderr": 0.0149981313484027 - }, - "piqa": { - "acc": 0.6882480957562568, - "acc_stderr": 0.010807431424873674, - "acc_norm": 0.6860718171926007, - "acc_norm_stderr": 0.010827928134189646 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_3.csv b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..5d95c39f314e454560e009246192440a1ff6a645 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.317,0.014721675438880224,0 +anli_r2,acc,0.356,0.015149042659306628,0 +anli_r3,acc,0.33916666666666667,0.013672343491681815,0 +arc_challenge,acc,0.2175767918088737,0.012057262020972502,0 +arc_challenge,acc_norm,0.2593856655290102,0.012808273573927102,0 +arc_easy,acc,0.4797979797979798,0.010251405621305368,0 +arc_easy,acc_norm,0.4541245791245791,0.010216507710244094,0 +boolq,acc,0.5110091743119266,0.008742934884517646,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.31302521008403367,,1 +copa,acc,0.65,0.0479372485441102,0 +hellaswag,acc,0.3723361880103565,0.004824393076826616,0 +hellaswag,acc_norm,0.4569806811392153,0.004971278309204194,0 +piqa,acc,0.6779107725788901,0.010902341695103446,0 +piqa,acc_norm,0.6844396082698585,0.01084311920175893,0 +rte,acc,0.555956678700361,0.029907396333795997,0 +sciq,acc,0.677,0.014794927843348633,0 +sciq,acc_norm,0.664,0.014944140233795028,0 +storycloze_2016,acc,0.6445750935328701,0.011068528452399877,0 +winogrande,acc,0.4940805051302289,0.014051500838485807,0 diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_3_lm-eval_global_step52452_2023-02-11-12-41-27_3shots_backup.json b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_3_lm-eval_global_step52452_2023-02-11-12-41-27_3shots_backup.json deleted file mode 100644 index ca00be04d2427dcac63d4f66ab34b4270f31a54e..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_3_lm-eval_global_step52452_2023-02-11-12-41-27_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.317, - "acc_stderr": 0.014721675438880224 - }, - "anli_r2": { - "acc": 0.356, - "acc_stderr": 0.015149042659306628 - }, - "anli_r3": { - "acc": 0.33916666666666667, - "acc_stderr": 0.013672343491681815 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.31302521008403367 - }, - "copa": { - "acc": 0.65, - "acc_stderr": 0.0479372485441102 - }, - "hellaswag": { - "acc": 0.3723361880103565, - "acc_stderr": 0.004824393076826616, - "acc_norm": 0.4569806811392153, - "acc_norm_stderr": 0.004971278309204194 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795997 - }, - "winogrande": { - "acc": 0.4940805051302289, - "acc_stderr": 0.014051500838485807 - }, - "storycloze_2016": { - "acc": 0.6445750935328701, - "acc_stderr": 0.011068528452399877 - }, - "boolq": { - "acc": 0.5110091743119266, - "acc_stderr": 0.008742934884517646 - }, - "arc_easy": { - "acc": 0.4797979797979798, - "acc_stderr": 0.010251405621305368, - "acc_norm": 0.4541245791245791, - "acc_norm_stderr": 0.010216507710244094 - }, - "arc_challenge": { - "acc": 0.2175767918088737, - "acc_stderr": 0.012057262020972502, - "acc_norm": 0.2593856655290102, - "acc_norm_stderr": 0.012808273573927102 - }, - "sciq": { - "acc": 0.677, - "acc_stderr": 0.014794927843348633, - "acc_norm": 0.664, - "acc_norm_stderr": 0.014944140233795028 - }, - "piqa": { - "acc": 0.6779107725788901, - "acc_stderr": 0.010902341695103446, - "acc_norm": 0.6844396082698585, - "acc_norm_stderr": 0.01084311920175893 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_4.csv b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..a9178cd3cc196172cc4244bead40d0448266abe0 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.322,0.014782913600996669,0 +anli_r2,acc,0.362,0.0152048409129195,0 +anli_r3,acc,0.3516666666666667,0.013789711695404794,0 +arc_challenge,acc,0.22013651877133106,0.01210812488346098,0 +arc_challenge,acc_norm,0.2551194539249147,0.012739038695202105,0 +arc_easy,acc,0.4604377104377104,0.010227616386289013,0 +arc_easy,acc_norm,0.45707070707070707,0.010221897564256056,0 +boolq,acc,0.5125382262996941,0.00874230497421832,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3189775910364146,,1 +copa,acc,0.67,0.04725815626252607,0 +hellaswag,acc,0.37054371639115713,0.004819633668832543,0 +hellaswag,acc_norm,0.4536944831706831,0.004968337144136362,0 +piqa,acc,0.675734494015234,0.01092153904134798,0 +piqa,acc_norm,0.6822633297062024,0.010863133246569292,0 +rte,acc,0.5667870036101083,0.029826764082138284,0 +sciq,acc,0.686,0.014683991951087966,0 +sciq,acc_norm,0.676,0.014806864733738859,0 +storycloze_2016,acc,0.6445750935328701,0.011068528452399877,0 +winogrande,acc,0.5169692186266772,0.014044390401612976,0 diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_4_lm-eval_global_step52452_2023-02-11-12-41-27_4shots_backup.json b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_4_lm-eval_global_step52452_2023-02-11-12-41-27_4shots_backup.json deleted file mode 100644 index c08a3ca54fb91c2fbc05afce6319e575244fc1b7..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_4_lm-eval_global_step52452_2023-02-11-12-41-27_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.322, - "acc_stderr": 0.014782913600996669 - }, - "anli_r2": { - "acc": 0.362, - "acc_stderr": 0.0152048409129195 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404794 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.3189775910364146 - }, - "copa": { - "acc": 0.67, - "acc_stderr": 0.04725815626252607 - }, - "hellaswag": { - "acc": 0.37054371639115713, - "acc_stderr": 0.004819633668832543, - "acc_norm": 0.4536944831706831, - "acc_norm_stderr": 0.004968337144136362 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.029826764082138284 - }, - "winogrande": { - "acc": 0.5169692186266772, - "acc_stderr": 0.014044390401612976 - }, - "storycloze_2016": { - "acc": 0.6445750935328701, - "acc_stderr": 0.011068528452399877 - }, - "boolq": { - "acc": 0.5125382262996941, - "acc_stderr": 0.00874230497421832 - }, - "arc_easy": { - "acc": 0.4604377104377104, - "acc_stderr": 0.010227616386289013, - "acc_norm": 0.45707070707070707, - "acc_norm_stderr": 0.010221897564256056 - }, - "arc_challenge": { - "acc": 0.22013651877133106, - "acc_stderr": 0.01210812488346098, - "acc_norm": 0.2551194539249147, - "acc_norm_stderr": 0.012739038695202105 - }, - "sciq": { - "acc": 0.686, - "acc_stderr": 0.014683991951087966, - "acc_norm": 0.676, - "acc_norm_stderr": 0.014806864733738859 - }, - "piqa": { - "acc": 0.675734494015234, - "acc_stderr": 0.01092153904134798, - "acc_norm": 0.6822633297062024, - "acc_norm_stderr": 0.010863133246569292 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_5.csv b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..dff29e41e56821587a0990ed9befdcbd3b23ad74 --- /dev/null +++ b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.311,0.014645596385722694,0 +anli_r2,acc,0.37,0.015275252316519362,0 +anli_r3,acc,0.3333333333333333,0.013613950010225606,0 +arc_challenge,acc,0.2150170648464164,0.012005717634133611,0 +arc_challenge,acc_norm,0.25426621160409557,0.012724999945157748,0 +arc_easy,acc,0.4718013468013468,0.010243454104071788,0 +arc_easy,acc_norm,0.45286195286195285,0.010214087372211396,0 +boolq,acc,0.5073394495412844,0.00874411277680133,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.34532420984033885,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.3760207130053774,0.004833953712521773,0 +hellaswag,acc_norm,0.45518820952001593,0.004969701081068364,0 +piqa,acc,0.6822633297062024,0.010863133246569286,0 +piqa,acc_norm,0.6822633297062024,0.010863133246569292,0 +rte,acc,0.5379061371841155,0.030009848912529113,0 +sciq,acc,0.678,0.014782913600996676,0 +sciq,acc_norm,0.676,0.014806864733738856,0 +storycloze_2016,acc,0.6493853554249065,0.011034317290463292,0 +winogrande,acc,0.5169692186266772,0.014044390401612976,0 diff --git a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_5_lm-eval_global_step52452_2023-02-11-12-41-27_5shots_backup.json b/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_5_lm-eval_global_step52452_2023-02-11-12-41-27_5shots_backup.json deleted file mode 100644 index 12b1a71b6515a52990457874823aaa3939fa206e..0000000000000000000000000000000000000000 --- a/2b855b1b25c4seed4/evaluation/rankeval/2b855b1b25c4seed4_5_lm-eval_global_step52452_2023-02-11-12-41-27_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.311, - "acc_stderr": 0.014645596385722694 - }, - "anli_r2": { - "acc": 0.37, - "acc_stderr": 0.015275252316519362 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.013613950010225606 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.34532420984033885 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - }, - "hellaswag": { - "acc": 0.3760207130053774, - "acc_stderr": 0.004833953712521773, - "acc_norm": 0.45518820952001593, - "acc_norm_stderr": 0.004969701081068364 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.5169692186266772, - "acc_stderr": 0.014044390401612976 - }, - "storycloze_2016": { - "acc": 0.6493853554249065, - "acc_stderr": 0.011034317290463292 - }, - "boolq": { - "acc": 0.5073394495412844, - "acc_stderr": 0.00874411277680133 - }, - "arc_easy": { - "acc": 0.4718013468013468, - "acc_stderr": 0.010243454104071788, - "acc_norm": 0.45286195286195285, - "acc_norm_stderr": 0.010214087372211396 - }, - "arc_challenge": { - "acc": 0.2150170648464164, - "acc_stderr": 0.012005717634133611, - "acc_norm": 0.25426621160409557, - "acc_norm_stderr": 0.012724999945157748 - }, - "sciq": { - "acc": 0.678, - "acc_stderr": 0.014782913600996676, - "acc_norm": 0.676, - "acc_norm_stderr": 0.014806864733738856 - }, - "piqa": { - "acc": 0.6822633297062024, - "acc_stderr": 0.010863133246569286, - "acc_norm": 0.6822633297062024, - "acc_norm_stderr": 0.010863133246569292 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_0.csv b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..e8fe6a5514e9daed6be9846fe6e0892fce667af9 --- /dev/null +++ b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.345,0.015039986742055237,0 +anli_r2,acc,0.364,0.015222868840522022,0 +anli_r3,acc,0.35333333333333333,0.013804572162314928,0 +arc_challenge,acc,0.24744027303754265,0.012610352663292673,0 +arc_challenge,acc_norm,0.28754266211604096,0.013226719056266132,0 +arc_easy,acc,0.5669191919191919,0.010167478013701787,0 +arc_easy,acc_norm,0.5008417508417509,0.010259768981815234,0 +boolq,acc,0.5920489296636086,0.008595583792654907,1 +cb,acc,0.44642857142857145,0.067031892279424,1 +cb,f1,0.3011063011063011,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.43796056562437763,0.004951222171763112,0 +hellaswag,acc_norm,0.563433578968333,0.00494946256368134,0 +piqa,acc,0.7437431991294886,0.010185787831565062,0 +piqa,acc_norm,0.7459194776931447,0.010157271999135044,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.806,0.012510816141264366,0 +sciq,acc_norm,0.727,0.014095022868717591,0 +storycloze_2016,acc,0.6916087653661144,0.010679734445487796,0 +winogrande,acc,0.55327545382794,0.013972488371616701,0 diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index 81f5f4fd2d08bb4875689dfb6d7fc7737a99d83c..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r2": { - "acc": 0.364, - "acc_stderr": 0.015222868840522022 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314928 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.067031892279424, - "f1": 0.3011063011063011 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.43796056562437763, - "acc_stderr": 0.004951222171763112, - "acc_norm": 0.563433578968333, - "acc_norm_stderr": 0.00494946256368134 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.55327545382794, - "acc_stderr": 0.013972488371616701 - }, - "storycloze_2016": { - "acc": 0.6916087653661144, - "acc_stderr": 0.010679734445487796 - }, - "boolq": { - "acc": 0.5920489296636086, - "acc_stderr": 0.008595583792654907 - }, - "arc_easy": { - "acc": 0.5669191919191919, - "acc_stderr": 0.010167478013701787, - "acc_norm": 0.5008417508417509, - "acc_norm_stderr": 0.010259768981815234 - }, - "arc_challenge": { - "acc": 0.24744027303754265, - "acc_stderr": 0.012610352663292673, - "acc_norm": 0.28754266211604096, - "acc_norm_stderr": 0.013226719056266132 - }, - "sciq": { - "acc": 0.806, - "acc_stderr": 0.012510816141264366, - "acc_norm": 0.727, - "acc_norm_stderr": 0.014095022868717591 - }, - "piqa": { - "acc": 0.7437431991294886, - "acc_stderr": 0.010185787831565062, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.010157271999135044 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_1.csv b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..def4159b5379a3f5938ad4f0eb16eca9062b0d05 --- /dev/null +++ b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.32,0.014758652303574883,0 +anli_r2,acc,0.334,0.01492201952373296,0 +anli_r3,acc,0.3466666666666667,0.013744022550571949,0 +arc_challenge,acc,0.24914675767918087,0.012639407111926437,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0 +arc_easy,acc,0.585016835016835,0.01011038315196112,0 +arc_easy,acc_norm,0.5454545454545454,0.010217299762709417,0 +boolq,acc,0.5767584097859327,0.008641391399113588,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3510037264094242,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.43547102170882296,0.004948052131344499,0 +hellaswag,acc_norm,0.5638319059948218,0.004948952519517515,0 +piqa,acc,0.7399347116430903,0.010234893249061298,0 +piqa,acc_norm,0.735582154515778,0.010289787244767146,0 +rte,acc,0.5234657039711191,0.030063300411902652,0 +sciq,acc,0.855,0.01113997751789013,0 +sciq,acc_norm,0.815,0.012285191326386696,0 +storycloze_2016,acc,0.6835916622127205,0.010754780097940887,0 +winogrande,acc,0.5501183898973955,0.01398171190404973,0 diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index 612064eae0e1df2b1b72c974861194c803c57006..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.32, - "acc_stderr": 0.014758652303574883 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.01492201952373296 - }, - "anli_r3": { - "acc": 0.3466666666666667, - "acc_stderr": 0.013744022550571949 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3510037264094242 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.43547102170882296, - "acc_stderr": 0.004948052131344499, - "acc_norm": 0.5638319059948218, - "acc_norm_stderr": 0.004948952519517515 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.5501183898973955, - "acc_stderr": 0.01398171190404973 - }, - "storycloze_2016": { - "acc": 0.6835916622127205, - "acc_stderr": 0.010754780097940887 - }, - "boolq": { - "acc": 0.5767584097859327, - "acc_stderr": 0.008641391399113588 - }, - "arc_easy": { - "acc": 0.585016835016835, - "acc_stderr": 0.01011038315196112, - "acc_norm": 0.5454545454545454, - "acc_norm_stderr": 0.010217299762709417 - }, - "arc_challenge": { - "acc": 0.24914675767918087, - "acc_stderr": 0.012639407111926437, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252425 - }, - "sciq": { - "acc": 0.855, - "acc_stderr": 0.01113997751789013, - "acc_norm": 0.815, - "acc_norm_stderr": 0.012285191326386696 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.010234893249061298, - "acc_norm": 0.735582154515778, - "acc_norm_stderr": 0.010289787244767146 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_2.csv b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..28f4eccea3cea8593bf146d90034bca971dcf4a6 --- /dev/null +++ b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229859,0 +anli_r2,acc,0.347,0.015060472031706622,0 +anli_r3,acc,0.33916666666666667,0.013672343491681822,0 +arc_challenge,acc,0.2593856655290102,0.012808273573927102,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975578,0 +arc_easy,acc,0.5925925925925926,0.010082326627832865,0 +arc_easy,acc_norm,0.5631313131313131,0.010177672928157694,0 +boolq,acc,0.5752293577981651,0.008645503833361106,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.2851465474416294,,1 +copa,acc,0.78,0.04163331998932262,0 +hellaswag,acc,0.433379804819757,0.0049452912700724315,0 +hellaswag,acc_norm,0.5641306512646883,0.00494856785637386,0 +piqa,acc,0.7399347116430903,0.0102348932490613,0 +piqa,acc_norm,0.7383025027203483,0.010255630772708232,0 +rte,acc,0.49097472924187724,0.030091559826331334,0 +sciq,acc,0.871,0.010605256784796582,0 +sciq,acc_norm,0.839,0.011628164696727195,0 +storycloze_2016,acc,0.694815606627472,0.010648664383985663,0 +winogrande,acc,0.5493291239147593,0.01398392886904024,0 diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json deleted file mode 100644 index f206b0a883e5f6c1fbcb1c5684067e84b785df06..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229859 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706622 - }, - "anli_r3": { - "acc": 0.33916666666666667, - "acc_stderr": 0.013672343491681822 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.2851465474416294 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.433379804819757, - "acc_stderr": 0.0049452912700724315, - "acc_norm": 0.5641306512646883, - "acc_norm_stderr": 0.00494856785637386 - }, - "rte": { - "acc": 0.49097472924187724, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5493291239147593, - "acc_stderr": 0.01398392886904024 - }, - "storycloze_2016": { - "acc": 0.694815606627472, - "acc_stderr": 0.010648664383985663 - }, - "boolq": { - "acc": 0.5752293577981651, - "acc_stderr": 0.008645503833361106 - }, - "arc_easy": { - "acc": 0.5925925925925926, - "acc_stderr": 0.010082326627832865, - "acc_norm": 0.5631313131313131, - "acc_norm_stderr": 0.010177672928157694 - }, - "arc_challenge": { - "acc": 0.2593856655290102, - "acc_stderr": 0.012808273573927102, - "acc_norm": 0.28071672354948807, - "acc_norm_stderr": 0.013131238126975578 - }, - "sciq": { - "acc": 0.871, - "acc_stderr": 0.010605256784796582, - "acc_norm": 0.839, - "acc_norm_stderr": 0.011628164696727195 - }, - "piqa": { - "acc": 0.7399347116430903, - "acc_stderr": 0.0102348932490613, - "acc_norm": 0.7383025027203483, - "acc_norm_stderr": 0.010255630772708232 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_3.csv b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..ff86ff0925782ca5cf5833451ebc667b736389eb --- /dev/null +++ b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.35,0.015090650341444231,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.25170648464163825,0.01268249633404296,0 +arc_challenge,acc_norm,0.27986348122866894,0.013119040897725922,0 +arc_easy,acc,0.5829124579124579,0.010117738967781977,0 +arc_easy,acc_norm,0.5686026936026936,0.010162752847747501,0 +boolq,acc,0.581651376146789,0.008627661390825412,1 +cb,acc,0.5535714285714286,0.06703189227942397,1 +cb,f1,0.5272727272727272,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.4340768771161123,0.004946221512145278,0 +hellaswag,acc_norm,0.5663214499103765,0.004945691164810063,0 +piqa,acc,0.7431991294885746,0.010192864802278039,0 +piqa,acc_norm,0.7404787812840044,0.010227939888173923,0 +rte,acc,0.48736462093862815,0.030086851767188564,0 +sciq,acc,0.879,0.010318210380946088,0 +sciq,acc_norm,0.849,0.011328165223341673,0 +storycloze_2016,acc,0.6958845537145911,0.010638172655194792,0 +winogrande,acc,0.5485398579321231,0.013986110301017764,0 diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index 58c34247c852d46e6dfc8145831ebe43d9d85943..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444231 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077247 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942397, - "f1": 0.5272727272727272 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.4340768771161123, - "acc_stderr": 0.004946221512145278, - "acc_norm": 0.5663214499103765, - "acc_norm_stderr": 0.004945691164810063 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5485398579321231, - "acc_stderr": 0.013986110301017764 - }, - "storycloze_2016": { - "acc": 0.6958845537145911, - "acc_stderr": 0.010638172655194792 - }, - "boolq": { - "acc": 0.581651376146789, - "acc_stderr": 0.008627661390825412 - }, - "arc_easy": { - "acc": 0.5829124579124579, - "acc_stderr": 0.010117738967781977, - "acc_norm": 0.5686026936026936, - "acc_norm_stderr": 0.010162752847747501 - }, - "arc_challenge": { - "acc": 0.25170648464163825, - "acc_stderr": 0.01268249633404296, - "acc_norm": 0.27986348122866894, - "acc_norm_stderr": 0.013119040897725922 - }, - "sciq": { - "acc": 0.879, - "acc_stderr": 0.010318210380946088, - "acc_norm": 0.849, - "acc_norm_stderr": 0.011328165223341673 - }, - "piqa": { - "acc": 0.7431991294885746, - "acc_stderr": 0.010192864802278039, - "acc_norm": 0.7404787812840044, - "acc_norm_stderr": 0.010227939888173923 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_4.csv b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..a9c244940320414cd878a8950ff4d3a471b51f09 --- /dev/null +++ b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229857,0 +anli_r2,acc,0.344,0.015029633724408947,0 +anli_r3,acc,0.3275,0.013553211167251939,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378607,0 +arc_challenge,acc_norm,0.2901023890784983,0.01326157367752077,0 +arc_easy,acc,0.5896464646464646,0.01009353125576546,0 +arc_easy,acc_norm,0.5812289562289562,0.010123487160167817,0 +boolq,acc,0.5758409785932722,0.008643869023388128,1 +cb,acc,0.5178571428571429,0.06737697508644645,1 +cb,f1,0.343827671913836,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.43397729535949015,0.004946089230153028,0 +hellaswag,acc_norm,0.5644293965345548,0.004948181367024958,0 +piqa,acc,0.7437431991294886,0.01018578783156506,0 +piqa,acc_norm,0.7393906420021763,0.01024182615581163,0 +rte,acc,0.4729241877256318,0.030052303463143706,0 +sciq,acc,0.88,0.01028132801274739,0 +sciq,acc_norm,0.859,0.011010914595992443,0 +storycloze_2016,acc,0.694815606627472,0.010648664383985661,0 +winogrande,acc,0.5548539857932123,0.013967662954355486,0 diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index 9def61d5d511a5a00646ce0ce932360333abe235..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229857 - }, - "anli_r2": { - "acc": 0.344, - "acc_stderr": 0.015029633724408947 - }, - "anli_r3": { - "acc": 0.3275, - "acc_stderr": 0.013553211167251939 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644645, - "f1": 0.343827671913836 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.43397729535949015, - "acc_stderr": 0.004946089230153028, - "acc_norm": 0.5644293965345548, - "acc_norm_stderr": 0.004948181367024958 - }, - "rte": { - "acc": 0.4729241877256318, - "acc_stderr": 0.030052303463143706 - }, - "winogrande": { - "acc": 0.5548539857932123, - "acc_stderr": 0.013967662954355486 - }, - "storycloze_2016": { - "acc": 0.694815606627472, - "acc_stderr": 0.010648664383985661 - }, - "boolq": { - "acc": 0.5758409785932722, - "acc_stderr": 0.008643869023388128 - }, - "arc_easy": { - "acc": 0.5896464646464646, - "acc_stderr": 0.01009353125576546, - "acc_norm": 0.5812289562289562, - "acc_norm_stderr": 0.010123487160167817 - }, - "arc_challenge": { - "acc": 0.25341296928327645, - "acc_stderr": 0.012710896778378607, - "acc_norm": 0.2901023890784983, - "acc_norm_stderr": 0.01326157367752077 - }, - "sciq": { - "acc": 0.88, - "acc_stderr": 0.01028132801274739, - "acc_norm": 0.859, - "acc_norm_stderr": 0.011010914595992443 - }, - "piqa": { - "acc": 0.7437431991294886, - "acc_stderr": 0.01018578783156506, - "acc_norm": 0.7393906420021763, - "acc_norm_stderr": 0.01024182615581163 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_5.csv b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..41f152bb4657787752525de3a9054b289c43efde --- /dev/null +++ b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.345,0.015039986742055235,0 +anli_r2,acc,0.343,0.015019206922356953,0 +anli_r3,acc,0.3566666666666667,0.013833742805050722,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858112,0 +arc_challenge,acc_norm,0.2815699658703072,0.013143376735009014,0 +arc_easy,acc,0.5904882154882155,0.01009036816099006,0 +arc_easy,acc_norm,0.5761784511784511,0.01014000609521361,0 +boolq,acc,0.5853211009174312,0.0086167917789813,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.3492957746478873,,1 +copa,acc,0.77,0.04229525846816505,0 +hellaswag,acc,0.4326827325234017,0.004944351065545858,0 +hellaswag,acc_norm,0.5704043019318861,0.0049400674020310485,0 +piqa,acc,0.735038084874864,0.010296557993316044,0 +piqa,acc_norm,0.7421109902067464,0.010206956662056238,0 +rte,acc,0.49458483754512633,0.030094698123239966,0 +sciq,acc,0.884,0.010131468138756995,0 +sciq,acc_norm,0.875,0.010463483381956722,0 +storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0 +winogrande,acc,0.5335438042620363,0.014020826677598096,0 diff --git a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json b/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json deleted file mode 100644 index cfcac24a2cd3f53cbc2691a06f39e5cd8304a903..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed2/evaluation/rankeval/2b855b28bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.345, - "acc_stderr": 0.015039986742055235 - }, - "anli_r2": { - "acc": 0.343, - "acc_stderr": 0.015019206922356953 - }, - "anli_r3": { - "acc": 0.3566666666666667, - "acc_stderr": 0.013833742805050722 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.3492957746478873 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816505 - }, - "hellaswag": { - "acc": 0.4326827325234017, - "acc_stderr": 0.004944351065545858, - "acc_norm": 0.5704043019318861, - "acc_norm_stderr": 0.0049400674020310485 - }, - "rte": { - "acc": 0.49458483754512633, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5335438042620363, - "acc_stderr": 0.014020826677598096 - }, - "storycloze_2016": { - "acc": 0.6932121859967931, - "acc_stderr": 0.010664275190473634 - }, - "boolq": { - "acc": 0.5853211009174312, - "acc_stderr": 0.0086167917789813 - }, - "arc_easy": { - "acc": 0.5904882154882155, - "acc_stderr": 0.01009036816099006, - "acc_norm": 0.5761784511784511, - "acc_norm_stderr": 0.01014000609521361 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858112, - "acc_norm": 0.2815699658703072, - "acc_norm_stderr": 0.013143376735009014 - }, - "sciq": { - "acc": 0.884, - "acc_stderr": 0.010131468138756995, - "acc_norm": 0.875, - "acc_norm_stderr": 0.010463483381956722 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316044, - "acc_norm": 0.7421109902067464, - "acc_norm_stderr": 0.010206956662056238 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_0.csv b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..caf25fb779ddd6ca9708e75765bc3676d177f022 --- /dev/null +++ b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732963,0 +anli_r2,acc,0.339,0.014976758771620345,0 +anli_r3,acc,0.33916666666666667,0.013672343491681812,0 +arc_challenge,acc,0.24914675767918087,0.012639407111926432,0 +arc_challenge,acc_norm,0.2883959044368601,0.013238394422428171,0 +arc_easy,acc,0.5664983164983165,0.010168640625454107,0 +arc_easy,acc_norm,0.5071548821548821,0.010258733022446367,0 +boolq,acc,0.6036697247706422,0.00855501670654043,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.29572649572649573,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4402509460266879,0.004954026775425773,0 +hellaswag,acc_norm,0.5676160127464649,0.004943945069611464,0 +piqa,acc,0.735038084874864,0.010296557993316054,0 +piqa,acc_norm,0.7399347116430903,0.010234893249061272,0 +rte,acc,0.5523465703971119,0.029931070362939526,0 +sciq,acc,0.824,0.012048616898597512,0 +sciq,acc_norm,0.75,0.013699915608779773,0 +storycloze_2016,acc,0.6926777124532336,0.010669445081866662,0 +winogrande,acc,0.5737963693764798,0.013898585965412338,0 diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index 510edda38572ef2b5449911d4fb6f2695f879d5f..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732963 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.014976758771620345 - }, - "anli_r3": { - "acc": 0.33916666666666667, - "acc_stderr": 0.013672343491681812 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.29572649572649573 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4402509460266879, - "acc_stderr": 0.004954026775425773, - "acc_norm": 0.5676160127464649, - "acc_norm_stderr": 0.004943945069611464 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5737963693764798, - "acc_stderr": 0.013898585965412338 - }, - "storycloze_2016": { - "acc": 0.6926777124532336, - "acc_stderr": 0.010669445081866662 - }, - "boolq": { - "acc": 0.6036697247706422, - "acc_stderr": 0.00855501670654043 - }, - "arc_easy": { - "acc": 0.5664983164983165, - "acc_stderr": 0.010168640625454107, - "acc_norm": 0.5071548821548821, - "acc_norm_stderr": 0.010258733022446367 - }, - "arc_challenge": { - "acc": 0.24914675767918087, - "acc_stderr": 0.012639407111926432, - "acc_norm": 0.2883959044368601, - "acc_norm_stderr": 0.013238394422428171 - }, - "sciq": { - "acc": 0.824, - "acc_stderr": 0.012048616898597512, - "acc_norm": 0.75, - "acc_norm_stderr": 0.013699915608779773 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316054, - "acc_norm": 0.7399347116430903, - "acc_norm_stderr": 0.010234893249061272 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_1.csv b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..2c939032a1f41d677e881defdf5b1d4d72920f69 --- /dev/null +++ b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095526,0 +anli_r2,acc,0.332,0.01489959724281148,0 +anli_r3,acc,0.35583333333333333,0.013826518748493305,0 +arc_challenge,acc,0.2593856655290102,0.012808273573927099,0 +arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0 +arc_easy,acc,0.577020202020202,0.010137328382209094,0 +arc_easy,acc_norm,0.5429292929292929,0.010221897564256037,0 +boolq,acc,0.6131498470948012,0.008518188340844746,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3524590163934426,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.43507269468233417,0.004947533158712096,0 +hellaswag,acc_norm,0.5671181039633539,0.004944620712318274,0 +piqa,acc,0.7279651795429815,0.010382763786247381,0 +piqa,acc_norm,0.7328618063112078,0.010323440492612418,0 +rte,acc,0.47653429602888087,0.03006330041190266,0 +sciq,acc,0.867,0.010743669132397327,0 +sciq,acc_norm,0.845,0.011450157470799468,0 +storycloze_2016,acc,0.6809192944949225,0.010778970635312489,0 +winogrande,acc,0.5666929755327546,0.013926915052757347,0 diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index fd5ac142875e1bcbcac82fde5521ce550cebc64c..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.01489959724281148 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.013826518748493305 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3524590163934426 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.43507269468233417, - "acc_stderr": 0.004947533158712096, - "acc_norm": 0.5671181039633539, - "acc_norm_stderr": 0.004944620712318274 - }, - "rte": { - "acc": 0.47653429602888087, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5666929755327546, - "acc_stderr": 0.013926915052757347 - }, - "storycloze_2016": { - "acc": 0.6809192944949225, - "acc_stderr": 0.010778970635312489 - }, - "boolq": { - "acc": 0.6131498470948012, - "acc_stderr": 0.008518188340844746 - }, - "arc_easy": { - "acc": 0.577020202020202, - "acc_stderr": 0.010137328382209094, - "acc_norm": 0.5429292929292929, - "acc_norm_stderr": 0.010221897564256037 - }, - "arc_challenge": { - "acc": 0.2593856655290102, - "acc_stderr": 0.012808273573927099, - "acc_norm": 0.28924914675767915, - "acc_norm_stderr": 0.013250012579393443 - }, - "sciq": { - "acc": 0.867, - "acc_stderr": 0.010743669132397327, - "acc_norm": 0.845, - "acc_norm_stderr": 0.011450157470799468 - }, - "piqa": { - "acc": 0.7279651795429815, - "acc_stderr": 0.010382763786247381, - "acc_norm": 0.7328618063112078, - "acc_norm_stderr": 0.010323440492612418 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_2.csv b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..ae35cc04c56939109ad69047a18ab4e4a3da7f7e --- /dev/null +++ b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.328,0.014853842487270334,0 +anli_r2,acc,0.341,0.014998131348402697,0 +anli_r3,acc,0.32083333333333336,0.013480882752851553,0 +arc_challenge,acc,0.2551194539249147,0.012739038695202098,0 +arc_challenge,acc_norm,0.2815699658703072,0.013143376735009019,0 +arc_easy,acc,0.5871212121212122,0.010102837421104665,0 +arc_easy,acc_norm,0.5631313131313131,0.01017767292815769,0 +boolq,acc,0.6021406727828746,0.00856064116930337,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.23703703703703702,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.433877713602868,0.00494595674494381,0 +hellaswag,acc_norm,0.566620195180243,0.004945291270072431,0 +piqa,acc,0.7312295973884657,0.010343392940090011,0 +piqa,acc_norm,0.73449401523395,0.01030330865302443,0 +rte,acc,0.44404332129963897,0.029907396333795987,0 +sciq,acc,0.873,0.010534798620855748,0 +sciq,acc_norm,0.857,0.011075814808567038,0 +storycloze_2016,acc,0.6878674505611972,0.010715220346279685,0 +winogrande,acc,0.5674822415153907,0.013923911578623828,0 diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index 542db363705164750123f8a70fb7a97faaa9bee5..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.328, - "acc_stderr": 0.014853842487270334 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.014998131348402697 - }, - "anli_r3": { - "acc": 0.32083333333333336, - "acc_stderr": 0.013480882752851553 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.23703703703703702 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.433877713602868, - "acc_stderr": 0.00494595674494381, - "acc_norm": 0.566620195180243, - "acc_norm_stderr": 0.004945291270072431 - }, - "rte": { - "acc": 0.44404332129963897, - "acc_stderr": 0.029907396333795987 - }, - "winogrande": { - "acc": 0.5674822415153907, - "acc_stderr": 0.013923911578623828 - }, - "storycloze_2016": { - "acc": 0.6878674505611972, - "acc_stderr": 0.010715220346279685 - }, - "boolq": { - "acc": 0.6021406727828746, - "acc_stderr": 0.00856064116930337 - }, - "arc_easy": { - "acc": 0.5871212121212122, - "acc_stderr": 0.010102837421104665, - "acc_norm": 0.5631313131313131, - "acc_norm_stderr": 0.01017767292815769 - }, - "arc_challenge": { - "acc": 0.2551194539249147, - "acc_stderr": 0.012739038695202098, - "acc_norm": 0.2815699658703072, - "acc_norm_stderr": 0.013143376735009019 - }, - "sciq": { - "acc": 0.873, - "acc_stderr": 0.010534798620855748, - "acc_norm": 0.857, - "acc_norm_stderr": 0.011075814808567038 - }, - "piqa": { - "acc": 0.7312295973884657, - "acc_stderr": 0.010343392940090011, - "acc_norm": 0.73449401523395, - "acc_norm_stderr": 0.01030330865302443 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_3.csv b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..a504850edbe48bedfb73a317530b84784ad7e019 --- /dev/null +++ b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.014888272588203934,0 +anli_r2,acc,0.348,0.01507060460376841,0 +anli_r3,acc,0.3425,0.013704669762934725,0 +arc_challenge,acc,0.26535836177474403,0.012902554762313966,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0 +arc_easy,acc,0.5888047138047138,0.010096663811817681,0 +arc_easy,acc_norm,0.5787037037037037,0.010131882498193127,0 +boolq,acc,0.6027522935779817,0.008558401855851156,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.355952380952381,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.4356701852220673,0.004948310399746082,0 +hellaswag,acc_norm,0.5717984465245967,0.004938068627349501,0 +piqa,acc,0.7290533188248096,0.010369718937426843,0 +piqa,acc_norm,0.7459194776931447,0.01015727199913505,0 +rte,acc,0.48375451263537905,0.030080573208738064,0 +sciq,acc,0.879,0.010318210380946092,0 +sciq,acc_norm,0.869,0.010674874844837956,0 +storycloze_2016,acc,0.6900053447354356,0.01069504280621255,0 +winogrande,acc,0.5564325177584846,0.013962694907620402,0 diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index d1058cc0ce72e01128aa822172f6fe36ebb15a0c..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203934 - }, - "anli_r2": { - "acc": 0.348, - "acc_stderr": 0.01507060460376841 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934725 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.355952380952381 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.4356701852220673, - "acc_stderr": 0.004948310399746082, - "acc_norm": 0.5717984465245967, - "acc_norm_stderr": 0.004938068627349501 - }, - "rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5564325177584846, - "acc_stderr": 0.013962694907620402 - }, - "storycloze_2016": { - "acc": 0.6900053447354356, - "acc_stderr": 0.01069504280621255 - }, - "boolq": { - "acc": 0.6027522935779817, - "acc_stderr": 0.008558401855851156 - }, - "arc_easy": { - "acc": 0.5888047138047138, - "acc_stderr": 0.010096663811817681, - "acc_norm": 0.5787037037037037, - "acc_norm_stderr": 0.010131882498193127 - }, - "arc_challenge": { - "acc": 0.26535836177474403, - "acc_stderr": 0.012902554762313966, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252425 - }, - "sciq": { - "acc": 0.879, - "acc_stderr": 0.010318210380946092, - "acc_norm": 0.869, - "acc_norm_stderr": 0.010674874844837956 - }, - "piqa": { - "acc": 0.7290533188248096, - "acc_stderr": 0.010369718937426843, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.01015727199913505 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_4.csv b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..b4b6940eeb4f6663937e1c372ec55c736b069c1a --- /dev/null +++ b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.377,0.015333170125779847,0 +anli_r2,acc,0.352,0.015110404505648673,0 +anli_r3,acc,0.3516666666666667,0.01378971169540479,0 +arc_challenge,acc,0.26706484641638223,0.012928933196496354,0 +arc_challenge,acc_norm,0.295221843003413,0.013329750293382316,0 +arc_easy,acc,0.5871212121212122,0.010102837421104665,0 +arc_easy,acc_norm,0.5787037037037037,0.010131882498193127,0 +boolq,acc,0.6033639143730887,0.008556148582031999,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.23543123543123545,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.43537143995220073,0.004947922692688842,0 +hellaswag,acc_norm,0.5713005377414858,0.004938787067611809,0 +piqa,acc,0.73449401523395,0.010303308653024429,0 +piqa,acc_norm,0.7459194776931447,0.01015727199913505,0 +rte,acc,0.44404332129963897,0.02990739633379598,0 +sciq,acc,0.891,0.009859828407037188,0 +sciq,acc_norm,0.877,0.010391293421849879,0 +storycloze_2016,acc,0.6974879743452699,0.010622307774396942,0 +winogrande,acc,0.5556432517758485,0.013965196769083555,0 diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json deleted file mode 100644 index 43b23422242c59a40ac795b168f4cd161206b542..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.377, - "acc_stderr": 0.015333170125779847 - }, - "anli_r2": { - "acc": 0.352, - "acc_stderr": 0.015110404505648673 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.01378971169540479 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.23543123543123545 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.43537143995220073, - "acc_stderr": 0.004947922692688842, - "acc_norm": 0.5713005377414858, - "acc_norm_stderr": 0.004938787067611809 - }, - "rte": { - "acc": 0.44404332129963897, - "acc_stderr": 0.02990739633379598 - }, - "winogrande": { - "acc": 0.5556432517758485, - "acc_stderr": 0.013965196769083555 - }, - "storycloze_2016": { - "acc": 0.6974879743452699, - "acc_stderr": 0.010622307774396942 - }, - "boolq": { - "acc": 0.6033639143730887, - "acc_stderr": 0.008556148582031999 - }, - "arc_easy": { - "acc": 0.5871212121212122, - "acc_stderr": 0.010102837421104665, - "acc_norm": 0.5787037037037037, - "acc_norm_stderr": 0.010131882498193127 - }, - "arc_challenge": { - "acc": 0.26706484641638223, - "acc_stderr": 0.012928933196496354, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.013329750293382316 - }, - "sciq": { - "acc": 0.891, - "acc_stderr": 0.009859828407037188, - "acc_norm": 0.877, - "acc_norm_stderr": 0.010391293421849879 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024429, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.01015727199913505 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_5.csv b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..c61d2e63ce1fe68cf0821c5b8cb71e162839150a --- /dev/null +++ b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.354,0.015129868238451773,0 +anli_r2,acc,0.35,0.01509065034144423,0 +anli_r3,acc,0.3308333333333333,0.013588208070709007,0 +arc_challenge,acc,0.2790102389078498,0.013106784883601336,0 +arc_challenge,acc_norm,0.30119453924914674,0.013406741767847626,0 +arc_easy,acc,0.5934343434343434,0.010079056419223527,0 +arc_easy,acc_norm,0.5820707070707071,0.010120628211017875,0 +boolq,acc,0.5948012232415902,0.008586427929715524,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.2695612844866576,,1 +copa,acc,0.76,0.042923469599092816,0 +hellaswag,acc,0.4366660027882892,0.004949589567678894,0 +hellaswag,acc_norm,0.574088826926907,0.00493469801205025,0 +piqa,acc,0.7301414581066377,0.010356595421852206,0 +piqa,acc_norm,0.7426550598476604,0.010199921064792509,0 +rte,acc,0.48736462093862815,0.030086851767188564,0 +sciq,acc,0.893,0.009779910359847165,0 +sciq,acc_norm,0.887,0.010016552866696862,0 +storycloze_2016,acc,0.6969535008017104,0.01062761307337672,0 +winogrande,acc,0.5493291239147593,0.013983928869040239,0 diff --git a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index 534d47faa3595fbd7fb4e87733ca2c04742af208..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed3/evaluation/rankeval/2b855b28bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.354, - "acc_stderr": 0.015129868238451773 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.01509065034144423 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070709007 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.2695612844866576 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.042923469599092816 - }, - "hellaswag": { - "acc": 0.4366660027882892, - "acc_stderr": 0.004949589567678894, - "acc_norm": 0.574088826926907, - "acc_norm_stderr": 0.00493469801205025 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5493291239147593, - "acc_stderr": 0.013983928869040239 - }, - "storycloze_2016": { - "acc": 0.6969535008017104, - "acc_stderr": 0.01062761307337672 - }, - "boolq": { - "acc": 0.5948012232415902, - "acc_stderr": 0.008586427929715524 - }, - "arc_easy": { - "acc": 0.5934343434343434, - "acc_stderr": 0.010079056419223527, - "acc_norm": 0.5820707070707071, - "acc_norm_stderr": 0.010120628211017875 - }, - "arc_challenge": { - "acc": 0.2790102389078498, - "acc_stderr": 0.013106784883601336, - "acc_norm": 0.30119453924914674, - "acc_norm_stderr": 0.013406741767847626 - }, - "sciq": { - "acc": 0.893, - "acc_stderr": 0.009779910359847165, - "acc_norm": 0.887, - "acc_norm_stderr": 0.010016552866696862 - }, - "piqa": { - "acc": 0.7301414581066377, - "acc_stderr": 0.010356595421852206, - "acc_norm": 0.7426550598476604, - "acc_norm_stderr": 0.010199921064792509 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_0.csv b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..9fea1eec135ec5128a0667184f4bd9f7357ca780 --- /dev/null +++ b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795023,0 +anli_r2,acc,0.336,0.014944140233795028,0 +anli_r3,acc,0.355,0.013819249004047296,0 +arc_challenge,acc,0.25170648464163825,0.012682496334042961,0 +arc_challenge,acc_norm,0.26621160409556316,0.012915774781523214,0 +arc_easy,acc,0.5723905723905723,0.010151683397430677,0 +arc_easy,acc_norm,0.5084175084175084,0.010258329515226459,0 +boolq,acc,0.5571865443425077,0.008687668766930823,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.2798764552150155,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.44144592710615416,0.00495544756469405,0 +hellaswag,acc_norm,0.569806811392153,0.004940911779273381,0 +piqa,acc,0.73449401523395,0.010303308653024427,0 +piqa,acc_norm,0.7437431991294886,0.01018578783156505,0 +rte,acc,0.5848375451263538,0.02966006629089348,0 +sciq,acc,0.83,0.011884495834541677,0 +sciq,acc_norm,0.721,0.014190150117612028,0 +storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0 +winogrande,acc,0.5706393054459353,0.013911537499969174,0 diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json deleted file mode 100644 index 9b13743ba222ca56458aa036fc8c1c666dcc0451..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795023 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795028 - }, - "anli_r3": { - "acc": 0.355, - "acc_stderr": 0.013819249004047296 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.2798764552150155 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.44144592710615416, - "acc_stderr": 0.00495544756469405, - "acc_norm": 0.569806811392153, - "acc_norm_stderr": 0.004940911779273381 - }, - "rte": { - "acc": 0.5848375451263538, - "acc_stderr": 0.02966006629089348 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.013911537499969174 - }, - "storycloze_2016": { - "acc": 0.6932121859967931, - "acc_stderr": 0.010664275190473634 - }, - "boolq": { - "acc": 0.5571865443425077, - "acc_stderr": 0.008687668766930823 - }, - "arc_easy": { - "acc": 0.5723905723905723, - "acc_stderr": 0.010151683397430677, - "acc_norm": 0.5084175084175084, - "acc_norm_stderr": 0.010258329515226459 - }, - "arc_challenge": { - "acc": 0.25170648464163825, - "acc_stderr": 0.012682496334042961, - "acc_norm": 0.26621160409556316, - "acc_norm_stderr": 0.012915774781523214 - }, - "sciq": { - "acc": 0.83, - "acc_stderr": 0.011884495834541677, - "acc_norm": 0.721, - "acc_norm_stderr": 0.014190150117612028 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024427, - "acc_norm": 0.7437431991294886, - "acc_norm_stderr": 0.01018578783156505 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_1.csv b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..f5b3582bf8077392e438899bbaae04080dae84ac --- /dev/null +++ b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.337,0.014955087918653605,0 +anli_r2,acc,0.339,0.014976758771620344,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.2593856655290102,0.012808273573927099,0 +arc_challenge,acc_norm,0.2790102389078498,0.013106784883601333,0 +arc_easy,acc,0.5812289562289562,0.010123487160167807,0 +arc_easy,acc_norm,0.5467171717171717,0.010214901516731604,0 +boolq,acc,0.5688073394495413,0.008661853128165595,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.3299319727891156,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.4347739494124676,0.004947141797384121,0 +hellaswag,acc_norm,0.5640310695080661,0.004948696280312415,0 +piqa,acc,0.7372143634385201,0.010269354068140769,0 +piqa,acc_norm,0.7410228509249184,0.010220966031405621,0 +rte,acc,0.5234657039711191,0.030063300411902652,0 +sciq,acc,0.859,0.011010914595992443,0 +sciq,acc_norm,0.834,0.011772110370812189,0 +storycloze_2016,acc,0.6873329770176376,0.010720223172953167,0 +winogrande,acc,0.5572217837411207,0.013960157350784978,0 diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index 93e3ddea39b47f8a155dd06db02828134cb9fc0d..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.337, - "acc_stderr": 0.014955087918653605 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.014976758771620344 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.3299319727891156 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.4347739494124676, - "acc_stderr": 0.004947141797384121, - "acc_norm": 0.5640310695080661, - "acc_norm_stderr": 0.004948696280312415 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.5572217837411207, - "acc_stderr": 0.013960157350784978 - }, - "storycloze_2016": { - "acc": 0.6873329770176376, - "acc_stderr": 0.010720223172953167 - }, - "boolq": { - "acc": 0.5688073394495413, - "acc_stderr": 0.008661853128165595 - }, - "arc_easy": { - "acc": 0.5812289562289562, - "acc_stderr": 0.010123487160167807, - "acc_norm": 0.5467171717171717, - "acc_norm_stderr": 0.010214901516731604 - }, - "arc_challenge": { - "acc": 0.2593856655290102, - "acc_stderr": 0.012808273573927099, - "acc_norm": 0.2790102389078498, - "acc_norm_stderr": 0.013106784883601333 - }, - "sciq": { - "acc": 0.859, - "acc_stderr": 0.011010914595992443, - "acc_norm": 0.834, - "acc_norm_stderr": 0.011772110370812189 - }, - "piqa": { - "acc": 0.7372143634385201, - "acc_stderr": 0.010269354068140769, - "acc_norm": 0.7410228509249184, - "acc_norm_stderr": 0.010220966031405621 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_2.csv b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..97598c33babfb90395303ae803ad0dfafae10666 --- /dev/null +++ b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.349,0.0150806639915631,0 +anli_r2,acc,0.341,0.0149981313484027,0 +anli_r3,acc,0.3258333333333333,0.01353542204341747,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.28668941979522183,0.013214986329274763,0 +arc_easy,acc,0.5909090909090909,0.010088775152615788,0 +arc_easy,acc_norm,0.585016835016835,0.010110383151961137,0 +boolq,acc,0.5923547400611621,0.008594580270731612,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.24464600655076843,,1 +copa,acc,0.69,0.04648231987117316,0 +hellaswag,acc,0.43636725751842265,0.004949207947265912,0 +hellaswag,acc_norm,0.5658235411272655,0.00494635359093702,0 +piqa,acc,0.735038084874864,0.01029655799331605,0 +piqa,acc_norm,0.7383025027203483,0.01025563077270823,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.883,0.01016928780271333,0 +sciq,acc_norm,0.861,0.010945263761042965,0 +storycloze_2016,acc,0.6825227151256013,0.010764505409830935,0 +winogrande,acc,0.5674822415153907,0.013923911578623833,0 diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index 57d62b71a773a1e40d18f92da67f7fcd608d5ce1..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.349, - "acc_stderr": 0.0150806639915631 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.0149981313484027 - }, - "anli_r3": { - "acc": 0.3258333333333333, - "acc_stderr": 0.01353542204341747 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.24464600655076843 - }, - "copa": { - "acc": 0.69, - "acc_stderr": 0.04648231987117316 - }, - "hellaswag": { - "acc": 0.43636725751842265, - "acc_stderr": 0.004949207947265912, - "acc_norm": 0.5658235411272655, - "acc_norm_stderr": 0.00494635359093702 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5674822415153907, - "acc_stderr": 0.013923911578623833 - }, - "storycloze_2016": { - "acc": 0.6825227151256013, - "acc_stderr": 0.010764505409830935 - }, - "boolq": { - "acc": 0.5923547400611621, - "acc_stderr": 0.008594580270731612 - }, - "arc_easy": { - "acc": 0.5909090909090909, - "acc_stderr": 0.010088775152615788, - "acc_norm": 0.585016835016835, - "acc_norm_stderr": 0.010110383151961137 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.013214986329274763 - }, - "sciq": { - "acc": 0.883, - "acc_stderr": 0.01016928780271333, - "acc_norm": 0.861, - "acc_norm_stderr": 0.010945263761042965 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.01029655799331605, - "acc_norm": 0.7383025027203483, - "acc_norm_stderr": 0.01025563077270823 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_3.csv b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..ee7ba132216da0ee843348ffaaadfff5ff7da7cc --- /dev/null +++ b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121731,0 +anli_r2,acc,0.353,0.01512017260548369,0 +anli_r3,acc,0.34,0.013680495725767787,0 +arc_challenge,acc,0.2593856655290102,0.012808273573927099,0 +arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0 +arc_easy,acc,0.5989057239057239,0.01005705110653437,0 +arc_easy,acc_norm,0.5871212121212122,0.010102837421104668,0 +boolq,acc,0.5957186544342508,0.008583313811372069,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.3828354466652339,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.43377813184624575,0.004945824056501812,0 +hellaswag,acc_norm,0.5705038836885082,0.004939925958728881,0 +piqa,acc,0.7393906420021763,0.010241826155811627,0 +piqa,acc_norm,0.7410228509249184,0.010220966031405616,0 +rte,acc,0.555956678700361,0.029907396333795994,0 +sciq,acc,0.892,0.009820001651345688,0 +sciq,acc_norm,0.875,0.010463483381956722,0 +storycloze_2016,acc,0.6889363976483164,0.010705164869803167,0 +winogrande,acc,0.55327545382794,0.013972488371616692,0 diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json deleted file mode 100644 index f15f4eb99b027735ffb8ce2f68e79f6856fc4ca6..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_3_lm-eval_global_step52452_2023-02-13-10-25-19_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.015008706182121731 - }, - "anli_r2": { - "acc": 0.353, - "acc_stderr": 0.01512017260548369 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767787 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.3828354466652339 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.43377813184624575, - "acc_stderr": 0.004945824056501812, - "acc_norm": 0.5705038836885082, - "acc_norm_stderr": 0.004939925958728881 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795994 - }, - "winogrande": { - "acc": 0.55327545382794, - "acc_stderr": 0.013972488371616692 - }, - "storycloze_2016": { - "acc": 0.6889363976483164, - "acc_stderr": 0.010705164869803167 - }, - "boolq": { - "acc": 0.5957186544342508, - "acc_stderr": 0.008583313811372069 - }, - "arc_easy": { - "acc": 0.5989057239057239, - "acc_stderr": 0.01005705110653437, - "acc_norm": 0.5871212121212122, - "acc_norm_stderr": 0.010102837421104668 - }, - "arc_challenge": { - "acc": 0.2593856655290102, - "acc_stderr": 0.012808273573927099, - "acc_norm": 0.2841296928327645, - "acc_norm_stderr": 0.013179442447653886 - }, - "sciq": { - "acc": 0.892, - "acc_stderr": 0.009820001651345688, - "acc_norm": 0.875, - "acc_norm_stderr": 0.010463483381956722 - }, - "piqa": { - "acc": 0.7393906420021763, - "acc_stderr": 0.010241826155811627, - "acc_norm": 0.7410228509249184, - "acc_norm_stderr": 0.010220966031405616 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_4.csv b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..f6802d793c7368d6dcfe5cb36498e54981fc30d7 --- /dev/null +++ b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.36,0.01518652793204012,0 +anli_r2,acc,0.374,0.015308767369006361,0 +anli_r3,acc,0.35083333333333333,0.013782212417178193,0 +arc_challenge,acc,0.257679180887372,0.0127807705627684,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0 +arc_easy,acc,0.5997474747474747,0.01005355011989613,0 +arc_easy,acc_norm,0.5824915824915825,0.01011918737777604,0 +boolq,acc,0.5914373088685015,0.008597580502718664,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.3044335645314837,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4364668392750448,0.004949335356881864,0 +hellaswag,acc_norm,0.5702051384186417,0.004940349676769331,0 +piqa,acc,0.7372143634385201,0.010269354068140769,0 +piqa,acc_norm,0.7437431991294886,0.010185787831565058,0 +rte,acc,0.48375451263537905,0.030080573208738064,0 +sciq,acc,0.891,0.009859828407037191,0 +sciq,acc_norm,0.876,0.01042749887234396,0 +storycloze_2016,acc,0.6905398182789952,0.01068995674518907,0 +winogrande,acc,0.5477505919494869,0.013988256216606014,0 diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index 56634c0ad842a53684998b72537fd1a7dab50328..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.36, - "acc_stderr": 0.01518652793204012 - }, - "anli_r2": { - "acc": 0.374, - "acc_stderr": 0.015308767369006361 - }, - "anli_r3": { - "acc": 0.35083333333333333, - "acc_stderr": 0.013782212417178193 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.3044335645314837 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4364668392750448, - "acc_stderr": 0.004949335356881864, - "acc_norm": 0.5702051384186417, - "acc_norm_stderr": 0.004940349676769331 - }, - "rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5477505919494869, - "acc_stderr": 0.013988256216606014 - }, - "storycloze_2016": { - "acc": 0.6905398182789952, - "acc_stderr": 0.01068995674518907 - }, - "boolq": { - "acc": 0.5914373088685015, - "acc_stderr": 0.008597580502718664 - }, - "arc_easy": { - "acc": 0.5997474747474747, - "acc_stderr": 0.01005355011989613, - "acc_norm": 0.5824915824915825, - "acc_norm_stderr": 0.01011918737777604 - }, - "arc_challenge": { - "acc": 0.257679180887372, - "acc_stderr": 0.0127807705627684, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097222 - }, - "sciq": { - "acc": 0.891, - "acc_stderr": 0.009859828407037191, - "acc_norm": 0.876, - "acc_norm_stderr": 0.01042749887234396 - }, - "piqa": { - "acc": 0.7372143634385201, - "acc_stderr": 0.010269354068140769, - "acc_norm": 0.7437431991294886, - "acc_norm_stderr": 0.010185787831565058 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_5.csv b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..3e8f6f1a8ffa5fedcba05abefc323e3fe19f81bb --- /dev/null +++ b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.353,0.01512017260548369,0 +anli_r2,acc,0.356,0.015149042659306623,0 +anli_r3,acc,0.33166666666666667,0.013596836729485168,0 +arc_challenge,acc,0.2627986348122867,0.012862523175351331,0 +arc_challenge,acc_norm,0.28668941979522183,0.013214986329274767,0 +arc_easy,acc,0.5997474747474747,0.010053550119896127,0 +arc_easy,acc_norm,0.5854377104377104,0.010108889212447791,0 +boolq,acc,0.6012232415902141,0.008563973987729907,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.33751698259688173,,1 +copa,acc,0.71,0.04560480215720684,0 +hellaswag,acc,0.43596893049193386,0.004948696280312426,0 +hellaswag,acc_norm,0.5710017924716192,0.0049392156821917695,0 +piqa,acc,0.7366702937976061,0.010276185322196764,0 +piqa,acc_norm,0.7415669205658324,0.010213971636773313,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.899,0.009533618929340985,0 +sciq,acc_norm,0.896,0.009658016218524293,0 +storycloze_2016,acc,0.6953500801710315,0.0106434269886468,0 +winogrande,acc,0.5564325177584846,0.013962694907620404,0 diff --git a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index de195a99ac2e23f260e0e32a891a49f7696d4ac8..0000000000000000000000000000000000000000 --- a/2b855b28bc4seed4/evaluation/rankeval/2b855b28bc4seed4_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.353, - "acc_stderr": 0.01512017260548369 - }, - "anli_r2": { - "acc": 0.356, - "acc_stderr": 0.015149042659306623 - }, - "anli_r3": { - "acc": 0.33166666666666667, - "acc_stderr": 0.013596836729485168 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.33751698259688173 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.04560480215720684 - }, - "hellaswag": { - "acc": 0.43596893049193386, - "acc_stderr": 0.004948696280312426, - "acc_norm": 0.5710017924716192, - "acc_norm_stderr": 0.0049392156821917695 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5564325177584846, - "acc_stderr": 0.013962694907620404 - }, - "storycloze_2016": { - "acc": 0.6953500801710315, - "acc_stderr": 0.0106434269886468 - }, - "boolq": { - "acc": 0.6012232415902141, - "acc_stderr": 0.008563973987729907 - }, - "arc_easy": { - "acc": 0.5997474747474747, - "acc_stderr": 0.010053550119896127, - "acc_norm": 0.5854377104377104, - "acc_norm_stderr": 0.010108889212447791 - }, - "arc_challenge": { - "acc": 0.2627986348122867, - "acc_stderr": 0.012862523175351331, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.013214986329274767 - }, - "sciq": { - "acc": 0.899, - "acc_stderr": 0.009533618929340985, - "acc_norm": 0.896, - "acc_norm_stderr": 0.009658016218524293 - }, - "piqa": { - "acc": 0.7366702937976061, - "acc_stderr": 0.010276185322196764, - "acc_norm": 0.7415669205658324, - "acc_norm_stderr": 0.010213971636773313 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/generation/merged.csv b/2b855b4bc4seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..5311eb3bd9739aaa3a6f134499b1828a4b6f4f84 --- /dev/null +++ b/2b855b4bc4seed1/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0781272317594066 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0781272317594066 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.14751847292773587 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.14751847292773587 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.16331509152950488 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16331509152950488 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.16584161893779908 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.16584161893779908 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1702076056023139 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1702076056023139 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1738222237468194 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1738222237468194 +e2e_nlg_cleaned,5,average,multiple,0.14980537408392997 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.042153838410666736 +gem_xsum,0,median,rouge2_fmeasure,0.042153838410666736 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.024555822633540537 +gem_xsum,1,median,rouge2_fmeasure,0.024555822633540537 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.023587313413704444 +gem_xsum,2,median,rouge2_fmeasure,0.023587313413704444 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.02358333545532719 +gem_xsum,3,median,rouge2_fmeasure,0.02358333545532719 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.006195379439697378 +gem_xsum,4,median,rouge2_fmeasure,0.006195379439697378 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001287874039080709 +gem_xsum,5,median,rouge2_fmeasure,0.0001287874039080709 +gem_xsum,5,average,multiple,0.02003407945947406 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.047349099388232035 +web_nlg_en,0,median,rouge2_fmeasure,0.047349099388232035 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.04487032485588369 +web_nlg_en,1,median,rouge2_fmeasure,0.04487032485588369 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.04634376242269252 +web_nlg_en,2,median,rouge2_fmeasure,0.04634376242269252 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.04503121538891503 +web_nlg_en,3,median,rouge2_fmeasure,0.04503121538891503 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.047200493334113615 +web_nlg_en,4,median,rouge2_fmeasure,0.047200493334113615 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.04547317077887062 +web_nlg_en,5,median,rouge2_fmeasure,0.04547317077887062 +web_nlg_en,5,average,multiple,0.04604467769478458 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03177170959182661 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03177170959182661 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.035673976193771974 +wiki_lingua_en,1,median,rouge2_fmeasure,0.035673976193771974 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03838403178488669 +wiki_lingua_en,2,median,rouge2_fmeasure,0.03838403178488669 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.032809011988041384 +wiki_lingua_en,3,median,rouge2_fmeasure,0.032809011988041384 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.010744972904987648 +wiki_lingua_en,4,median,rouge2_fmeasure,0.010744972904987648 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0016579937024865626 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0016579937024865626 +wiki_lingua_en,5,average,multiple,0.025173616027666813 diff --git a/2b855b4bc4seed1/evaluation/generation/merged.json b/2b855b4bc4seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..625826acf389f59d2a92c518e99364ab408ccace --- /dev/null +++ b/2b855b4bc4seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.39855658833240104, "bleu_stderr": 0.026569783418037774, "rouge1_fmeasure": 0.10116791036204026, "rouge1_fmeasure_stderr": 0.002146154058076065, "rouge1_precision": 0.0681383451903841, "rouge1_precision_stderr": 0.0017459951545863855, "rouge1_recall": 0.2868586326665418, "rouge1_recall_stderr": 0.004824268129831654, "rouge2_fmeasure": 0.047349099388232035, "rouge2_fmeasure_stderr": 0.0013284747595520357, "rouge2_precision": 0.03126302205288469, "rouge2_precision_stderr": 0.0009850036254092948, "rouge2_recall": 0.13556136849826403, "rouge2_recall_stderr": 0.0032166746311611994, "rougeL_fmeasure": 0.09604330456797407, "rougeL_fmeasure_stderr": 0.0019713714851164322, "rougeL_precision": 0.06437528461062834, "rougeL_precision_stderr": 0.0015957361187538223, "rougeL_recall": 0.2753959775705074, "rougeL_recall_stderr": 0.00466857019137397, "rougeLsum_fmeasure": 0.09560406363234744, "rougeLsum_fmeasure_stderr": 0.0020066099973174915, "rougeLsum_precision": 0.06439853493257244, "rougeLsum_precision_stderr": 0.0016465025871715334, "rougeLsum_recall": 0.27133410145709913, "rougeLsum_recall_stderr": 0.004513671060323518}}, "1": {"PALM_prompt": {"bleu": 0.3912276412395352, "bleu_stderr": 0.020484250080896618, "rouge1_fmeasure": 0.1006872191190483, "rouge1_fmeasure_stderr": 0.001920986262572305, "rouge1_precision": 0.06518931479471321, "rouge1_precision_stderr": 0.0014967694916187414, "rouge1_recall": 0.33171280095627653, "rouge1_recall_stderr": 0.00514509486146305, "rouge2_fmeasure": 0.04487032485588369, "rouge2_fmeasure_stderr": 0.0011904326422867075, "rouge2_precision": 0.02909822469430401, "rouge2_precision_stderr": 0.0008998133580491436, "rouge2_recall": 0.1502346730077987, "rouge2_recall_stderr": 0.0033848523651528895, "rougeL_fmeasure": 0.09361543338651233, "rougeL_fmeasure_stderr": 0.0017548257333492123, "rougeL_precision": 0.06065875807260051, "rougeL_precision_stderr": 0.0013716176596965028, "rougeL_recall": 0.3059288650916899, "rougeL_recall_stderr": 0.004623721258942203, "rougeLsum_fmeasure": 0.09537017930398344, "rougeLsum_fmeasure_stderr": 0.001823579905497483, "rougeLsum_precision": 0.06183903916740186, "rougeLsum_precision_stderr": 0.0014278699660441308, "rougeLsum_recall": 0.3121661527895801, "rougeLsum_recall_stderr": 0.004757283098586568}}, "2": {"PALM_prompt": {"bleu": 0.3943262562299206, "bleu_stderr": 0.018261926096308043, "rouge1_fmeasure": 0.10142763892550687, "rouge1_fmeasure_stderr": 0.0018695705948765919, "rouge1_precision": 0.06568090565271803, "rouge1_precision_stderr": 0.0014593667843626683, "rouge1_recall": 0.33176044666345866, "rouge1_recall_stderr": 0.0049760255757141485, "rouge2_fmeasure": 0.04634376242269252, "rouge2_fmeasure_stderr": 0.0011703869000957733, "rouge2_precision": 0.030064238556505973, "rouge2_precision_stderr": 0.000878026151022135, "rouge2_recall": 0.15674156008610293, "rouge2_recall_stderr": 0.0034371591659074687, "rougeL_fmeasure": 0.09375576998957523, "rougeL_fmeasure_stderr": 0.00171329522247045, "rougeL_precision": 0.06073340922239521, "rougeL_precision_stderr": 0.0013391382988114195, "rougeL_recall": 0.30301799282679526, "rougeL_recall_stderr": 0.004405838361403633, "rougeLsum_fmeasure": 0.0960005452415318, "rougeLsum_fmeasure_stderr": 0.0017713740893331342, "rougeLsum_precision": 0.06222811774824065, "rougeLsum_precision_stderr": 0.001389419033401118, "rougeLsum_recall": 0.3122289734770347, "rougeLsum_recall_stderr": 0.004608484270680722}}, "3": {"PALM_prompt": {"bleu": 0.42220231954083376, "bleu_stderr": 0.02023512696975793, "rouge1_fmeasure": 0.09852727869171284, "rouge1_fmeasure_stderr": 0.0018203807487138662, "rouge1_precision": 0.06294424704272412, "rouge1_precision_stderr": 0.0013426038293432201, "rouge1_recall": 0.32828293290150784, "rouge1_recall_stderr": 0.005001939985910281, "rouge2_fmeasure": 0.04503121538891503, "rouge2_fmeasure_stderr": 0.0011459084477198273, "rouge2_precision": 0.028779067635461713, "rouge2_precision_stderr": 0.0008198408986678626, "rouge2_recall": 0.15496999960360197, "rouge2_recall_stderr": 0.0034216215513968302, "rougeL_fmeasure": 0.09052242409400411, "rougeL_fmeasure_stderr": 0.0016574959187572178, "rougeL_precision": 0.05785040573096921, "rougeL_precision_stderr": 0.0012115439751522847, "rougeL_recall": 0.2983337805760926, "rougeL_recall_stderr": 0.004398723198697165, "rougeLsum_fmeasure": 0.09294906918300895, "rougeLsum_fmeasure_stderr": 0.001716261400806889, "rougeLsum_precision": 0.0594396242004035, "rougeLsum_precision_stderr": 0.00126453568792367, "rougeLsum_recall": 0.3080884282709691, "rougeLsum_recall_stderr": 0.004608543541528767}}, "4": {"PALM_prompt": {"bleu": 0.4832210304652205, "bleu_stderr": 0.03147187969017503, "rouge1_fmeasure": 0.10154392419352416, "rouge1_fmeasure_stderr": 0.001855573131248702, "rouge1_precision": 0.06496573154327573, "rouge1_precision_stderr": 0.0014040449532895783, "rouge1_recall": 0.34055461469310166, "rouge1_recall_stderr": 0.0050133457039775635, "rouge2_fmeasure": 0.047200493334113615, "rouge2_fmeasure_stderr": 0.0011802748324519364, "rouge2_precision": 0.030167551913451496, "rouge2_precision_stderr": 0.0008561785653855975, "rouge2_recall": 0.1649962512635775, "rouge2_recall_stderr": 0.003584101763297551, "rougeL_fmeasure": 0.09311774371977341, "rougeL_fmeasure_stderr": 0.0016746907773452203, "rougeL_precision": 0.05953514643920731, "rougeL_precision_stderr": 0.0012430023753300773, "rougeL_recall": 0.3091164943758439, "rougeL_recall_stderr": 0.004419039964894518, "rougeLsum_fmeasure": 0.09607749936473489, "rougeLsum_fmeasure_stderr": 0.0017621551957160131, "rougeLsum_precision": 0.061471473785615884, "rougeLsum_precision_stderr": 0.0013159018012368767, "rougeLsum_recall": 0.32032521201557734, "rougeLsum_recall_stderr": 0.004674150509866559}}, "5": {"PALM_prompt": {"bleu": 0.45737528578894, "bleu_stderr": 0.02223362364269596, "rouge1_fmeasure": 0.09991060215055379, "rouge1_fmeasure_stderr": 0.0017580601671848262, "rouge1_precision": 0.06341236195807602, "rouge1_precision_stderr": 0.0013025059451882081, "rouge1_recall": 0.3444381235432288, "rouge1_recall_stderr": 0.005100649283959647, "rouge2_fmeasure": 0.04547317077887062, "rouge2_fmeasure_stderr": 0.001083447465046616, "rouge2_precision": 0.028780682477283995, "rouge2_precision_stderr": 0.0007717780565479907, "rouge2_recall": 0.16550143783892868, "rouge2_recall_stderr": 0.003572409556510635, "rougeL_fmeasure": 0.09108818436943181, "rougeL_fmeasure_stderr": 0.0015903792470751198, "rougeL_precision": 0.0578485176298394, "rougeL_precision_stderr": 0.0011713192716163545, "rougeL_recall": 0.311243979779327, "rougeL_recall_stderr": 0.004489723890330991, "rougeLsum_fmeasure": 0.09396070402136288, "rougeLsum_fmeasure_stderr": 0.0016577593277067773, "rougeLsum_precision": 0.0597173200997438, "rougeLsum_precision_stderr": 0.0012283187843817414, "rougeLsum_recall": 0.3220388574311313, "rougeLsum_recall_stderr": 0.004693107327532801}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.3690100977423922, "bleu_stderr": 0.05886941818468516, "rouge1_fmeasure": 0.1694168893882774, "rouge1_fmeasure_stderr": 0.0017285872978047574, "rouge1_precision": 0.14511944435822713, "rouge1_precision_stderr": 0.0017739222195375815, "rouge1_recall": 0.24520002842853666, "rouge1_recall_stderr": 0.0024767922959903223, "rouge2_fmeasure": 0.03177170959182661, "rouge2_fmeasure_stderr": 0.0007692709280843934, "rouge2_precision": 0.026968037085841123, "rouge2_precision_stderr": 0.0006724263803688037, "rouge2_recall": 0.04727287142494576, "rouge2_recall_stderr": 0.0012840793304952297, "rougeL_fmeasure": 0.13367149075100937, "rougeL_fmeasure_stderr": 0.00122579559557816, "rougeL_precision": 0.11308839561178641, "rougeL_precision_stderr": 0.0012317612395230065, "rougeL_recall": 0.19827948573980483, "rougeL_recall_stderr": 0.0020047077764803736, "rougeLsum_fmeasure": 0.15592593082738226, "rougeLsum_fmeasure_stderr": 0.0015743663299313405, "rougeLsum_precision": 0.13330979457859984, "rougeLsum_precision_stderr": 0.0016108665517001446, "rougeLsum_recall": 0.226523165276769, "rougeLsum_recall_stderr": 0.002292830009490771}}, "1": {"tldr_en": {"bleu": 1.7868385249352625, "bleu_stderr": 0.06643098164400961, "rouge1_fmeasure": 0.1815429849832861, "rouge1_fmeasure_stderr": 0.0018339982664320788, "rouge1_precision": 0.1563220314981603, "rouge1_precision_stderr": 0.0019288538487507313, "rouge1_recall": 0.26325355614186413, "rouge1_recall_stderr": 0.002686994036630168, "rouge2_fmeasure": 0.035673976193771974, "rouge2_fmeasure_stderr": 0.0008626117278733884, "rouge2_precision": 0.030543605372666018, "rouge2_precision_stderr": 0.0007754152670159024, "rouge2_recall": 0.05363235422287617, "rouge2_recall_stderr": 0.0014749147033489768, "rougeL_fmeasure": 0.13186989175022087, "rougeL_fmeasure_stderr": 0.0012587743127241181, "rougeL_precision": 0.11249260697257467, "rougeL_precision_stderr": 0.0013266232430102073, "rougeL_recall": 0.19600160136173014, "rougeL_recall_stderr": 0.0021032634815257457, "rougeLsum_fmeasure": 0.17088095619861712, "rougeLsum_fmeasure_stderr": 0.0017110735494681845, "rougeLsum_precision": 0.14699158833792147, "rougeLsum_precision_stderr": 0.0018030118143166547, "rougeLsum_recall": 0.24830570689715825, "rougeLsum_recall_stderr": 0.0025264893684192524}}, "2": {"tldr_en": {"bleu": 1.8963745828853016, "bleu_stderr": 0.08021880799057668, "rouge1_fmeasure": 0.18747800716688756, "rouge1_fmeasure_stderr": 0.001846937162893923, "rouge1_precision": 0.1637259808560354, "rouge1_precision_stderr": 0.002017053083347354, "rouge1_recall": 0.2675645692770856, "rouge1_recall_stderr": 0.002629826305543504, "rouge2_fmeasure": 0.03838403178488669, "rouge2_fmeasure_stderr": 0.0008793468371463138, "rouge2_precision": 0.03357198574623862, "rouge2_precision_stderr": 0.0008471590504777416, "rouge2_recall": 0.05670287996064233, "rouge2_recall_stderr": 0.0014609581024957142, "rougeL_fmeasure": 0.13884152020924362, "rougeL_fmeasure_stderr": 0.0012910258507251103, "rougeL_precision": 0.12020443998886943, "rougeL_precision_stderr": 0.0014203050912885904, "rougeL_recall": 0.20283299498036392, "rougeL_recall_stderr": 0.002086790453159732, "rougeLsum_fmeasure": 0.17614109076386195, "rougeLsum_fmeasure_stderr": 0.001727388356327573, "rougeLsum_precision": 0.15360087717039728, "rougeLsum_precision_stderr": 0.0018828689588982353, "rougeLsum_recall": 0.2523571093220897, "rougeLsum_recall_stderr": 0.0025102266090624974}}, "3": {"tldr_en": {"bleu": 1.9646254567397161, "bleu_stderr": 0.057075491568922325, "rouge1_fmeasure": 0.15614634812221673, "rouge1_fmeasure_stderr": 0.0020703756716238682, "rouge1_precision": 0.14301510545517512, "rouge1_precision_stderr": 0.002309076005710306, "rouge1_recall": 0.22154076255407457, "rouge1_recall_stderr": 0.003010516998702038, "rouge2_fmeasure": 0.032809011988041384, "rouge2_fmeasure_stderr": 0.0008454225326652912, "rouge2_precision": 0.02998405021751007, "rouge2_precision_stderr": 0.0009213928393943472, "rouge2_recall": 0.048768948166259624, "rouge2_recall_stderr": 0.0014429425871296203, "rougeL_fmeasure": 0.11710781259448001, "rougeL_fmeasure_stderr": 0.001486425944407185, "rougeL_precision": 0.10714061695488505, "rougeL_precision_stderr": 0.0017291510979768427, "rougeL_recall": 0.16946810310877372, "rougeL_recall_stderr": 0.0023670017930872435, "rougeLsum_fmeasure": 0.14700806948565207, "rougeLsum_fmeasure_stderr": 0.001940930721840521, "rougeLsum_precision": 0.13457044625902037, "rougeLsum_precision_stderr": 0.0021679948212344493, "rougeLsum_recall": 0.20912294885872634, "rougeLsum_recall_stderr": 0.002851560713931268}}, "4": {"tldr_en": {"bleu": 0.44123714013872695, "bleu_stderr": 0.03941198298833912, "rouge1_fmeasure": 0.05190882238695101, "rouge1_fmeasure_stderr": 0.0017704568053027153, "rouge1_precision": 0.04839149402820543, "rouge1_precision_stderr": 0.0018154126914704599, "rouge1_recall": 0.0763797139827709, "rouge1_recall_stderr": 0.002687114915047913, "rouge2_fmeasure": 0.010744972904987648, "rouge2_fmeasure_stderr": 0.0005635089537785232, "rouge2_precision": 0.00961923494025731, "rouge2_precision_stderr": 0.0005313682610078681, "rouge2_recall": 0.01695713661091346, "rouge2_recall_stderr": 0.001004848518846823, "rougeL_fmeasure": 0.0397938566579677, "rougeL_fmeasure_stderr": 0.0013337234042402404, "rougeL_precision": 0.037150316884056823, "rougeL_precision_stderr": 0.0013840484731482293, "rougeL_recall": 0.059558806955544184, "rougeL_recall_stderr": 0.0021226325282649, "rougeLsum_fmeasure": 0.04847441961045556, "rougeLsum_fmeasure_stderr": 0.0016506599915437933, "rougeLsum_precision": 0.04512978782545739, "rougeLsum_precision_stderr": 0.001692850076284173, "rougeLsum_recall": 0.07140313383359884, "rougeLsum_recall_stderr": 0.002513815424881597}}, "5": {"tldr_en": {"bleu": 9.820919733542687e-07, "bleu_stderr": 1.8968835689959146e-06, "rouge1_fmeasure": 0.00845254383401664, "rouge1_fmeasure_stderr": 0.0007874363081928194, "rouge1_precision": 0.008126763474813043, "rouge1_precision_stderr": 0.0008143969920360195, "rouge1_recall": 0.012724062506916302, "rouge1_recall_stderr": 0.0012549853819996692, "rouge2_fmeasure": 0.0016579937024865626, "rouge2_fmeasure_stderr": 0.00021772158325319047, "rouge2_precision": 0.001404947916671783, "rouge2_precision_stderr": 0.0001884209372573544, "rouge2_recall": 0.0029225050800347897, "rouge2_recall_stderr": 0.00046662061601198307, "rougeL_fmeasure": 0.006710885785199153, "rougeL_fmeasure_stderr": 0.000617445059108658, "rougeL_precision": 0.0065259191593632915, "rougeL_precision_stderr": 0.0006573529238346043, "rougeL_recall": 0.010258170625213047, "rougeL_recall_stderr": 0.0010284762210769883, "rougeLsum_fmeasure": 0.007972401310488701, "rougeLsum_fmeasure_stderr": 0.0007381135864703859, "rougeLsum_precision": 0.0076454293176619215, "rougeLsum_precision_stderr": 0.0007610127888008051, "rougeLsum_recall": 0.012131216596331538, "rougeLsum_recall_stderr": 0.001202490112618103}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 4.129757228038615, "bleu_stderr": 0.07014233813245054, "rouge1_fmeasure": 0.1963023731197804, "rouge1_fmeasure_stderr": 0.0021177845411018744, "rouge1_precision": 0.1549243323143822, "rouge1_precision_stderr": 0.0017710465053136317, "rouge1_recall": 0.28505359248537765, "rouge1_recall_stderr": 0.0030240906814134943, "rouge2_fmeasure": 0.0781272317594066, "rouge2_fmeasure_stderr": 0.0011108678646529282, "rouge2_precision": 0.06094285656359399, "rouge2_precision_stderr": 0.0008923016329885139, "rouge2_recall": 0.11574793975173864, "rouge2_recall_stderr": 0.001682823395366983, "rougeL_fmeasure": 0.1753767591966602, "rougeL_fmeasure_stderr": 0.0018343868630799004, "rougeL_precision": 0.1380786735865104, "rougeL_precision_stderr": 0.001521766772484174, "rougeL_recall": 0.2560687584672421, "rougeL_recall_stderr": 0.002688080710851207, "rougeLsum_fmeasure": 0.17207329998104542, "rougeLsum_fmeasure_stderr": 0.0019361501401427212, "rougeLsum_precision": 0.13565071522521033, "rougeLsum_precision_stderr": 0.0016036406101874248, "rougeLsum_recall": 0.25048383475986313, "rougeLsum_recall_stderr": 0.002798617263701007}}, "1": {"generate_text_restaurant": {"bleu": 8.299968276953818, "bleu_stderr": 0.12233321125733444, "rouge1_fmeasure": 0.37877480029239974, "rouge1_fmeasure_stderr": 0.0019784847616312632, "rouge1_precision": 0.37638385679182335, "rouge1_precision_stderr": 0.002317139736702885, "rouge1_recall": 0.42033889127353, "rouge1_recall_stderr": 0.002772945673846739, "rouge2_fmeasure": 0.14751847292773587, "rouge2_fmeasure_stderr": 0.0015764623094505865, "rouge2_precision": 0.14557219664635176, "rouge2_precision_stderr": 0.0016775934171071967, "rouge2_recall": 0.16609026065457985, "rouge2_recall_stderr": 0.0019605500141973544, "rougeL_fmeasure": 0.25935191152159226, "rougeL_fmeasure_stderr": 0.0015444480088272186, "rougeL_precision": 0.2580714671728175, "rougeL_precision_stderr": 0.0017766038886166667, "rougeL_recall": 0.2888777643523076, "rougeL_recall_stderr": 0.0021851921933268488, "rougeLsum_fmeasure": 0.3138145000350625, "rougeLsum_fmeasure_stderr": 0.0018697138228652053, "rougeLsum_precision": 0.31216259293253057, "rougeLsum_precision_stderr": 0.002128295675386538, "rougeLsum_recall": 0.34818131196844593, "rougeLsum_recall_stderr": 0.002527105759803588}}, "2": {"generate_text_restaurant": {"bleu": 8.32094088832304, "bleu_stderr": 0.1408456329896372, "rouge1_fmeasure": 0.3891942091560381, "rouge1_fmeasure_stderr": 0.0018863285748240007, "rouge1_precision": 0.37673820157456905, "rouge1_precision_stderr": 0.002525885889869433, "rouge1_recall": 0.45010798840015825, "rouge1_recall_stderr": 0.002660399505092188, "rouge2_fmeasure": 0.16331509152950488, "rouge2_fmeasure_stderr": 0.0015357435865889781, "rouge2_precision": 0.1588674350852028, "rouge2_precision_stderr": 0.001799580410910336, "rouge2_recall": 0.19126370921881766, "rouge2_recall_stderr": 0.001995920729312549, "rougeL_fmeasure": 0.2683563508617857, "rougeL_fmeasure_stderr": 0.0014877223321086363, "rougeL_precision": 0.25911730218786333, "rougeL_precision_stderr": 0.001898454144269244, "rougeL_recall": 0.31272395259229857, "rougeL_recall_stderr": 0.00218960828230586, "rougeLsum_fmeasure": 0.3266409070065726, "rougeLsum_fmeasure_stderr": 0.001793782232998535, "rougeLsum_precision": 0.3147064567889733, "rougeLsum_precision_stderr": 0.002200826709156465, "rougeLsum_recall": 0.3793544003572373, "rougeLsum_recall_stderr": 0.0025336496732474654}}, "3": {"generate_text_restaurant": {"bleu": 8.14287697368746, "bleu_stderr": 0.09189509790387232, "rouge1_fmeasure": 0.3833814965569996, "rouge1_fmeasure_stderr": 0.0018909883024634615, "rouge1_precision": 0.36461819536647777, "rouge1_precision_stderr": 0.0026487499518419765, "rouge1_recall": 0.4543503797519612, "rouge1_recall_stderr": 0.002582206511533246, "rouge2_fmeasure": 0.16584161893779908, "rouge2_fmeasure_stderr": 0.0015384057629127023, "rouge2_precision": 0.15958828700253225, "rouge2_precision_stderr": 0.00191846623883205, "rouge2_recall": 0.19891031000555218, "rouge2_recall_stderr": 0.0020106764519864156, "rougeL_fmeasure": 0.26759530893060857, "rougeL_fmeasure_stderr": 0.0014919222068034712, "rougeL_precision": 0.2544311472598548, "rougeL_precision_stderr": 0.002057737430115525, "rougeL_recall": 0.3197047799413762, "rougeL_recall_stderr": 0.0021685729115729683, "rougeLsum_fmeasure": 0.3261278580937773, "rougeLsum_fmeasure_stderr": 0.0017736884110960225, "rougeLsum_precision": 0.3090600248711991, "rougeLsum_precision_stderr": 0.0023136943881649544, "rougeLsum_recall": 0.38801660046844827, "rougeLsum_recall_stderr": 0.002479446128993374}}, "4": {"generate_text_restaurant": {"bleu": 8.340266904708615, "bleu_stderr": 0.15882362731641408, "rouge1_fmeasure": 0.3854358650362702, "rouge1_fmeasure_stderr": 0.0019534156712746817, "rouge1_precision": 0.37296072752446663, "rouge1_precision_stderr": 0.002869109612679922, "rouge1_recall": 0.4502609383284795, "rouge1_recall_stderr": 0.00251079382775608, "rouge2_fmeasure": 0.1702076056023139, "rouge2_fmeasure_stderr": 0.0016133056911393302, "rouge2_precision": 0.1666562401905645, "rouge2_precision_stderr": 0.002038677032860895, "rouge2_recall": 0.20086801672222046, "rouge2_recall_stderr": 0.002011662092774619, "rougeL_fmeasure": 0.2711757987789853, "rougeL_fmeasure_stderr": 0.0015777940102540016, "rougeL_precision": 0.26173580474947733, "rougeL_precision_stderr": 0.0021944891414563193, "rougeL_recall": 0.31915969202906563, "rougeL_recall_stderr": 0.002133982181573949, "rougeLsum_fmeasure": 0.32924428194860017, "rougeLsum_fmeasure_stderr": 0.0018366992292492765, "rougeLsum_precision": 0.3168756443706945, "rougeLsum_precision_stderr": 0.0024802113511425464, "rougeLsum_recall": 0.3863610636617126, "rougeLsum_recall_stderr": 0.0024230633383428616}}, "5": {"generate_text_restaurant": {"bleu": 8.533282452625134, "bleu_stderr": 0.1596869267996998, "rouge1_fmeasure": 0.387500499827696, "rouge1_fmeasure_stderr": 0.002012551837952501, "rouge1_precision": 0.3819741920769809, "rouge1_precision_stderr": 0.0030583258079532444, "rouge1_recall": 0.44635699494708264, "rouge1_recall_stderr": 0.0024922002220329363, "rouge2_fmeasure": 0.1738222237468194, "rouge2_fmeasure_stderr": 0.0016564597048057436, "rouge2_precision": 0.17385953576593263, "rouge2_precision_stderr": 0.0021587516126658113, "rouge2_recall": 0.2013810249422712, "rouge2_recall_stderr": 0.0019753430304224476, "rougeL_fmeasure": 0.275755504456721, "rougeL_fmeasure_stderr": 0.001646371536215165, "rougeL_precision": 0.2715629527377947, "rougeL_precision_stderr": 0.00239642782854299, "rougeL_recall": 0.3196239920854854, "rougeL_recall_stderr": 0.0021068495557586366, "rougeLsum_fmeasure": 0.33168731107832367, "rougeLsum_fmeasure_stderr": 0.001885184243072814, "rougeLsum_precision": 0.32541276849244294, "rougeLsum_precision_stderr": 0.002661022323727048, "rougeLsum_recall": 0.38355980857565064, "rougeLsum_recall_stderr": 0.0023839054579621287}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.704976167673198, "bleu_stderr": 0.07510580344587393, "rouge1_fmeasure": 0.20320759251494883, "rouge1_fmeasure_stderr": 0.0023711645310408895, "rouge1_precision": 0.14969414244704182, "rouge1_precision_stderr": 0.0019324896572609615, "rouge1_recall": 0.3424440467278392, "rouge1_recall_stderr": 0.0040995514469987174, "rouge2_fmeasure": 0.042153838410666736, "rouge2_fmeasure_stderr": 0.0014335873797812037, "rouge2_precision": 0.030400425095377902, "rouge2_precision_stderr": 0.001044026311849789, "rouge2_recall": 0.0739670064304767, "rouge2_recall_stderr": 0.002634989437040311, "rougeL_fmeasure": 0.15159348255296218, "rougeL_fmeasure_stderr": 0.0017876823212052383, "rougeL_precision": 0.11143432626337815, "rougeL_precision_stderr": 0.001432854409975987, "rougeL_recall": 0.25753222826743616, "rougeL_recall_stderr": 0.0033024077097477903, "rougeLsum_fmeasure": 0.15938433020387577, "rougeLsum_fmeasure_stderr": 0.0019723261078191117, "rougeLsum_precision": 0.11696758071244377, "rougeLsum_precision_stderr": 0.0015447016915361197, "rougeLsum_recall": 0.2708708922848335, "rougeLsum_recall_stderr": 0.003597404966268882}}, "1": {"article_DOC_summary": {"bleu": 0.979960173920773, "bleu_stderr": 0.05173529864353662, "rouge1_fmeasure": 0.15700622126726985, "rouge1_fmeasure_stderr": 0.0021222629533591006, "rouge1_precision": 0.11122405018195453, "rouge1_precision_stderr": 0.0015797262989822184, "rouge1_recall": 0.2782856404337479, "rouge1_recall_stderr": 0.0036415883048158764, "rouge2_fmeasure": 0.024555822633540537, "rouge2_fmeasure_stderr": 0.0010997545789997894, "rouge2_precision": 0.017149801531869974, "rouge2_precision_stderr": 0.0007672537603847499, "rouge2_recall": 0.045241611308698707, "rouge2_recall_stderr": 0.0021200374596874057, "rougeL_fmeasure": 0.12180978247573739, "rougeL_fmeasure_stderr": 0.0015608663723874288, "rougeL_precision": 0.08609418991835227, "rougeL_precision_stderr": 0.0011494360417164816, "rougeL_recall": 0.21746800955642373, "rougeL_recall_stderr": 0.002835886740287387, "rougeLsum_fmeasure": 0.12701371024428101, "rougeLsum_fmeasure_stderr": 0.0017141477451743852, "rougeLsum_precision": 0.08977133822978285, "rougeLsum_precision_stderr": 0.0012601057903106146, "rougeLsum_recall": 0.2265237802974383, "rougeLsum_recall_stderr": 0.0030682275926886926}}, "2": {"article_DOC_summary": {"bleu": 0.9592734153575632, "bleu_stderr": 0.09382162697466569, "rouge1_fmeasure": 0.15439985598556147, "rouge1_fmeasure_stderr": 0.0020685303106254766, "rouge1_precision": 0.10922125198022638, "rouge1_precision_stderr": 0.0015327381563681863, "rouge1_recall": 0.2740796461825564, "rouge1_recall_stderr": 0.0035621166880206914, "rouge2_fmeasure": 0.023587313413704444, "rouge2_fmeasure_stderr": 0.0011050020264692577, "rouge2_precision": 0.016487055335928664, "rouge2_precision_stderr": 0.0007677313542586885, "rouge2_recall": 0.04306796853828354, "rouge2_recall_stderr": 0.002091163544492895, "rougeL_fmeasure": 0.12120837216731699, "rougeL_fmeasure_stderr": 0.0015854487079549203, "rougeL_precision": 0.08557002380659585, "rougeL_precision_stderr": 0.0011627881583555256, "rougeL_recall": 0.21639702653925905, "rougeL_recall_stderr": 0.002830090102057936, "rougeLsum_fmeasure": 0.12464464617600614, "rougeLsum_fmeasure_stderr": 0.0017005400782115032, "rougeLsum_precision": 0.08798412491210018, "rougeLsum_precision_stderr": 0.001241391026067503, "rougeLsum_recall": 0.2224295384317635, "rougeLsum_recall_stderr": 0.0030332449340733547}}, "3": {"article_DOC_summary": {"bleu": 1.0250436676088148, "bleu_stderr": 0.0897345599425218, "rouge1_fmeasure": 0.15100142900440264, "rouge1_fmeasure_stderr": 0.002216876694666884, "rouge1_precision": 0.1090266212279622, "rouge1_precision_stderr": 0.001711662470692957, "rouge1_recall": 0.26435524062723414, "rouge1_recall_stderr": 0.003899243892653423, "rouge2_fmeasure": 0.02358333545532719, "rouge2_fmeasure_stderr": 0.0010856412409051479, "rouge2_precision": 0.016664166927212225, "rouge2_precision_stderr": 0.0007670588317614037, "rouge2_recall": 0.043006632410991154, "rouge2_recall_stderr": 0.002080317075223551, "rougeL_fmeasure": 0.11778480413469063, "rougeL_fmeasure_stderr": 0.0016849995779788692, "rougeL_precision": 0.08497732398758262, "rougeL_precision_stderr": 0.0012974380766172027, "rougeL_recall": 0.2073530127515781, "rougeL_recall_stderr": 0.0030747619829053943, "rougeLsum_fmeasure": 0.12273430725894952, "rougeLsum_fmeasure_stderr": 0.0018178140650586502, "rougeLsum_precision": 0.08853171692794809, "rougeLsum_precision_stderr": 0.0013913431824478968, "rougeLsum_recall": 0.21598097449057452, "rougeLsum_recall_stderr": 0.0032877874391039636}}, "4": {"article_DOC_summary": {"bleu": 0.5225577211686727, "bleu_stderr": 0.12959625082450632, "rouge1_fmeasure": 0.040937760756170734, "rouge1_fmeasure_stderr": 0.002299626505022236, "rouge1_precision": 0.03485569508169301, "rouge1_precision_stderr": 0.0021856260607702794, "rouge1_recall": 0.06463507331652409, "rouge1_recall_stderr": 0.003685938778071234, "rouge2_fmeasure": 0.006195379439697378, "rouge2_fmeasure_stderr": 0.0007036601029415052, "rouge2_precision": 0.005437259038133314, "rouge2_precision_stderr": 0.000802124167524944, "rouge2_recall": 0.010111288879263358, "rouge2_recall_stderr": 0.0011813527534910925, "rougeL_fmeasure": 0.03234035544052198, "rougeL_fmeasure_stderr": 0.001792868048264922, "rougeL_precision": 0.02802707208505075, "rougeL_precision_stderr": 0.0018488815770964972, "rougeL_recall": 0.051579755348292074, "rougeL_recall_stderr": 0.0029598852452192514, "rougeLsum_fmeasure": 0.03399814377685756, "rougeLsum_fmeasure_stderr": 0.0019035037454680388, "rougeLsum_precision": 0.02943801405378874, "rougeLsum_precision_stderr": 0.0019325677627601914, "rougeLsum_recall": 0.05386078779110535, "rougeLsum_recall_stderr": 0.003095591399792792}}, "5": {"article_DOC_summary": {"bleu": 7.180994433269743e-39, "bleu_stderr": 2.511599289322222e-33, "rouge1_fmeasure": 0.001990674361595793, "rouge1_fmeasure_stderr": 0.0005642781142620792, "rouge1_precision": 0.0022877323719954707, "rouge1_precision_stderr": 0.0006642644641306185, "rouge1_recall": 0.0018280680998604062, "rouge1_recall_stderr": 0.0005121323157465823, "rouge2_fmeasure": 0.0001287874039080709, "rouge2_fmeasure_stderr": 9.359407071565204e-05, "rouge2_precision": 0.00017152658662092626, "rouge2_precision_stderr": 0.00012780446523672577, "rouge2_recall": 0.00010414114187699093, "rouge2_recall_stderr": 7.474658294086318e-05, "rougeL_fmeasure": 0.0016185572630217646, "rougeL_fmeasure_stderr": 0.00045897887275809666, "rougeL_precision": 0.0018514296874598294, "rougeL_precision_stderr": 0.0005384819052908431, "rougeL_recall": 0.0014950663258536708, "rougeL_recall_stderr": 0.00041902435765448935, "rougeLsum_fmeasure": 0.0016694212075392255, "rougeLsum_fmeasure_stderr": 0.00047283742219582684, "rougeLsum_precision": 0.001905031745778869, "rougeLsum_precision_stderr": 0.0005530289094934512, "rougeLsum_recall": 0.001547348948891011, "rougeLsum_recall_stderr": 0.0004349724954627403}}}} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_0.csv b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..ebf61cc6b234d9303fdd9e68c89529a1d79540d3 --- /dev/null +++ b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.01488827258820393,0 +anli_r2,acc,0.35,0.015090650341444233,0 +anli_r3,acc,0.3516666666666667,0.013789711695404794,0 +arc_challenge,acc,0.25341296928327645,0.012710896778378606,0 +arc_challenge,acc_norm,0.26791808873720135,0.01294203019513643,0 +arc_easy,acc,0.5505050505050505,0.010207308833916037,0 +arc_easy,acc_norm,0.48063973063973064,0.010252089491165513,0 +boolq,acc,0.5489296636085627,0.008703080962379615,1 +cb,acc,0.5892857142857143,0.06633634150359538,1 +cb,f1,0.392018779342723,,1 +copa,acc,0.69,0.04648231987117316,0 +hellaswag,acc,0.4224258115913165,0.004929361040558255,0 +hellaswag,acc_norm,0.5451105357498506,0.004969431900874301,0 +piqa,acc,0.7295973884657236,0.010363167031620785,0 +piqa,acc_norm,0.7312295973884657,0.010343392940089995,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.785,0.012997843819031832,0 +sciq,acc_norm,0.689,0.014645596385722695,0 +storycloze_2016,acc,0.6771779796900054,0.010812153082758848,0 +winogrande,acc,0.5122336227308603,0.014048278820405612,0 diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json deleted file mode 100644 index 7d713c4b83b910bc85a8eab6ce4b93bf133f445f..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.01488827258820393 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444233 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404794 - }, - "cb": { - "acc": 0.5892857142857143, - "acc_stderr": 0.06633634150359538, - "f1": 0.392018779342723 - }, - "copa": { - "acc": 0.69, - "acc_stderr": 0.04648231987117316 - }, - "hellaswag": { - "acc": 0.4224258115913165, - "acc_stderr": 0.004929361040558255, - "acc_norm": 0.5451105357498506, - "acc_norm_stderr": 0.004969431900874301 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5122336227308603, - "acc_stderr": 0.014048278820405612 - }, - "storycloze_2016": { - "acc": 0.6771779796900054, - "acc_stderr": 0.010812153082758848 - }, - "boolq": { - "acc": 0.5489296636085627, - "acc_stderr": 0.008703080962379615 - }, - "arc_easy": { - "acc": 0.5505050505050505, - "acc_stderr": 0.010207308833916037, - "acc_norm": 0.48063973063973064, - "acc_norm_stderr": 0.010252089491165513 - }, - "arc_challenge": { - "acc": 0.25341296928327645, - "acc_stderr": 0.012710896778378606, - "acc_norm": 0.26791808873720135, - "acc_norm_stderr": 0.01294203019513643 - }, - "sciq": { - "acc": 0.785, - "acc_stderr": 0.012997843819031832, - "acc_norm": 0.689, - "acc_norm_stderr": 0.014645596385722695 - }, - "piqa": { - "acc": 0.7295973884657236, - "acc_stderr": 0.010363167031620785, - "acc_norm": 0.7312295973884657, - "acc_norm_stderr": 0.010343392940089995 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_1.csv b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..f4de62d76adbe53059bfac43121282b2ae5daa33 --- /dev/null +++ b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.328,0.014853842487270334,0 +anli_r3,acc,0.3425,0.013704669762934727,0 +arc_challenge,acc,0.257679180887372,0.012780770562768405,0 +arc_challenge,acc_norm,0.2738907849829352,0.013032004972989501,0 +arc_easy,acc,0.563973063973064,0.01017545958275974,0 +arc_easy,acc_norm,0.5244107744107744,0.010247548905242276,0 +boolq,acc,0.5412844036697247,0.008715193815788289,1 +cb,acc,0.5535714285714286,0.06703189227942395,1 +cb,f1,0.3464373464373464,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.4223262298346943,0.004929204864315973,0 +hellaswag,acc_norm,0.5416251742680741,0.004972460206842308,0 +piqa,acc,0.7306855277475517,0.010350004070588758,0 +piqa,acc_norm,0.7268770402611534,0.010395730264453258,0 +rte,acc,0.5234657039711191,0.030063300411902652,0 +sciq,acc,0.829,0.011912216456264604,0 +sciq,acc_norm,0.797,0.012726073744598276,0 +storycloze_2016,acc,0.6691608765366115,0.010880601338204659,0 +winogrande,acc,0.5272296764009471,0.014031631629827696,0 diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json deleted file mode 100644 index 6b53b90bccc987b0897259cc246873aba15e0b6b..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.328, - "acc_stderr": 0.014853842487270334 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934727 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.3464373464373464 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.4223262298346943, - "acc_stderr": 0.004929204864315973, - "acc_norm": 0.5416251742680741, - "acc_norm_stderr": 0.004972460206842308 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.5272296764009471, - "acc_stderr": 0.014031631629827696 - }, - "storycloze_2016": { - "acc": 0.6691608765366115, - "acc_stderr": 0.010880601338204659 - }, - "boolq": { - "acc": 0.5412844036697247, - "acc_stderr": 0.008715193815788289 - }, - "arc_easy": { - "acc": 0.563973063973064, - "acc_stderr": 0.01017545958275974, - "acc_norm": 0.5244107744107744, - "acc_norm_stderr": 0.010247548905242276 - }, - "arc_challenge": { - "acc": 0.257679180887372, - "acc_stderr": 0.012780770562768405, - "acc_norm": 0.2738907849829352, - "acc_norm_stderr": 0.013032004972989501 - }, - "sciq": { - "acc": 0.829, - "acc_stderr": 0.011912216456264604, - "acc_norm": 0.797, - "acc_norm_stderr": 0.012726073744598276 - }, - "piqa": { - "acc": 0.7306855277475517, - "acc_stderr": 0.010350004070588758, - "acc_norm": 0.7268770402611534, - "acc_norm_stderr": 0.010395730264453258 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_2.csv b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..8170d78be550854ae09546bb2e4bcd9c5d3a1ed7 --- /dev/null +++ b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.344,0.015029633724408945,0 +anli_r2,acc,0.353,0.015120172605483696,0 +anli_r3,acc,0.335,0.013630871843821476,0 +arc_challenge,acc,0.24914675767918087,0.012639407111926437,0 +arc_challenge,acc_norm,0.27047781569965873,0.012980954547659554,0 +arc_easy,acc,0.5597643097643098,0.010186228624515651,0 +arc_easy,acc_norm,0.5446127946127947,0.010218861787618728,0 +boolq,acc,0.5333333333333333,0.008725599880049204,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.22811671087533156,,1 +copa,acc,0.68,0.046882617226215034,0 +hellaswag,acc,0.4190400318661621,0.004923935749842497,0 +hellaswag,acc_norm,0.5415255925114519,0.0049725431277678825,0 +piqa,acc,0.7219804134929271,0.010453117358332811,0 +piqa,acc_norm,0.7285092491838956,0.010376251176596135,0 +rte,acc,0.5018050541516246,0.030096267148976633,0 +sciq,acc,0.851,0.011266140684632171,0 +sciq,acc_norm,0.827,0.01196721413755994,0 +storycloze_2016,acc,0.6712987707108499,0.010862700030538157,0 +winogrande,acc,0.5272296764009471,0.014031631629827698,0 diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json deleted file mode 100644 index ca5ea553c9d8c7d2ee0c6b0d6c46a97f02cb391f..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408945 - }, - "anli_r2": { - "acc": 0.353, - "acc_stderr": 0.015120172605483696 - }, - "anli_r3": { - "acc": 0.335, - "acc_stderr": 0.013630871843821476 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.22811671087533156 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.046882617226215034 - }, - "hellaswag": { - "acc": 0.4190400318661621, - "acc_stderr": 0.004923935749842497, - "acc_norm": 0.5415255925114519, - "acc_norm_stderr": 0.0049725431277678825 - }, - "rte": { - "acc": 0.5018050541516246, - "acc_stderr": 0.030096267148976633 - }, - "winogrande": { - "acc": 0.5272296764009471, - "acc_stderr": 0.014031631629827698 - }, - "storycloze_2016": { - "acc": 0.6712987707108499, - "acc_stderr": 0.010862700030538157 - }, - "boolq": { - "acc": 0.5333333333333333, - "acc_stderr": 0.008725599880049204 - }, - "arc_easy": { - "acc": 0.5597643097643098, - "acc_stderr": 0.010186228624515651, - "acc_norm": 0.5446127946127947, - "acc_norm_stderr": 0.010218861787618728 - }, - "arc_challenge": { - "acc": 0.24914675767918087, - "acc_stderr": 0.012639407111926437, - "acc_norm": 0.27047781569965873, - "acc_norm_stderr": 0.012980954547659554 - }, - "sciq": { - "acc": 0.851, - "acc_stderr": 0.011266140684632171, - "acc_norm": 0.827, - "acc_norm_stderr": 0.01196721413755994 - }, - "piqa": { - "acc": 0.7219804134929271, - "acc_stderr": 0.010453117358332811, - "acc_norm": 0.7285092491838956, - "acc_norm_stderr": 0.010376251176596135 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_3.csv b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..ec2e166e612ce2aaf756da31d48833d65704f322 --- /dev/null +++ b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.323,0.014794927843348633,0 +anli_r2,acc,0.346,0.015050266127564446,0 +anli_r3,acc,0.35833333333333334,0.013848054140053427,0 +arc_challenge,acc,0.24914675767918087,0.012639407111926435,0 +arc_challenge,acc_norm,0.26535836177474403,0.012902554762313967,0 +arc_easy,acc,0.5568181818181818,0.010193324837773493,0 +arc_easy,acc_norm,0.5555555555555556,0.010196254838691682,0 +boolq,acc,0.5321100917431193,0.008727003026917805,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.26702833031946954,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.42093208524198367,0.00492699683019424,0 +hellaswag,acc_norm,0.5461063533160725,0.004968521608065464,0 +piqa,acc,0.7285092491838956,0.010376251176596135,0 +piqa,acc_norm,0.7257889009793254,0.010408618664933382,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.853,0.011203415395160331,0 +sciq,acc_norm,0.823,0.012075463420375061,0 +storycloze_2016,acc,0.677712453233565,0.010807461374996363,0 +winogrande,acc,0.5303867403314917,0.01402651083942874,0 diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json deleted file mode 100644 index 4b548ea4c045ec1a98e87109b14efadacad57fb8..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_3_lm-eval_global_step52452_2023-02-15-00-33-59_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.014794927843348633 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.015050266127564446 - }, - "anli_r3": { - "acc": 0.35833333333333334, - "acc_stderr": 0.013848054140053427 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.26702833031946954 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.42093208524198367, - "acc_stderr": 0.00492699683019424, - "acc_norm": 0.5461063533160725, - "acc_norm_stderr": 0.004968521608065464 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366426 - }, - "winogrande": { - "acc": 0.5303867403314917, - "acc_stderr": 0.01402651083942874 - }, - "storycloze_2016": { - "acc": 0.677712453233565, - "acc_stderr": 0.010807461374996363 - }, - "boolq": { - "acc": 0.5321100917431193, - "acc_stderr": 0.008727003026917805 - }, - "arc_easy": { - "acc": 0.5568181818181818, - "acc_stderr": 0.010193324837773493, - "acc_norm": 0.5555555555555556, - "acc_norm_stderr": 0.010196254838691682 - }, - "arc_challenge": { - "acc": 0.24914675767918087, - "acc_stderr": 0.012639407111926435, - "acc_norm": 0.26535836177474403, - "acc_norm_stderr": 0.012902554762313967 - }, - "sciq": { - "acc": 0.853, - "acc_stderr": 0.011203415395160331, - "acc_norm": 0.823, - "acc_norm_stderr": 0.012075463420375061 - }, - "piqa": { - "acc": 0.7285092491838956, - "acc_stderr": 0.010376251176596135, - "acc_norm": 0.7257889009793254, - "acc_norm_stderr": 0.010408618664933382 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_4.csv b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..82f289aceefc645e0cba8388f10cbad50964f19c --- /dev/null +++ b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.338,0.014965960710224487,0 +anli_r2,acc,0.346,0.01505026612756445,0 +anli_r3,acc,0.3425,0.013704669762934725,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252427,0 +arc_easy,acc,0.5589225589225589,0.01018829322104057,0 +arc_easy,acc_norm,0.5542929292929293,0.010199118183322989,0 +boolq,acc,0.5370030581039755,0.008721074177479657,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.29907407407407405,,1 +copa,acc,0.72,0.04512608598542126,0 +hellaswag,acc,0.4208325034853615,0.004926837572202165,0 +hellaswag,acc_norm,0.5493925512846046,0.0049653753416431376,0 +piqa,acc,0.7252448313384113,0.010415033676676042,0 +piqa,acc_norm,0.719260065288357,0.010484325438311827,0 +rte,acc,0.5270758122743683,0.0300523034631437,0 +sciq,acc,0.858,0.011043457699378232,0 +sciq,acc_norm,0.849,0.011328165223341678,0 +storycloze_2016,acc,0.6787814003206841,0.010798029402794916,0 +winogrande,acc,0.510655090765588,0.014049294536290396,0 diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json deleted file mode 100644 index 882eafd0f63e5760fb8562622921169173e0bb46..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.014965960710224487 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.01505026612756445 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934725 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.29907407407407405 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542126 - }, - "hellaswag": { - "acc": 0.4208325034853615, - "acc_stderr": 0.004926837572202165, - "acc_norm": 0.5493925512846046, - "acc_norm_stderr": 0.0049653753416431376 - }, - "rte": { - "acc": 0.5270758122743683, - "acc_stderr": 0.0300523034631437 - }, - "winogrande": { - "acc": 0.510655090765588, - "acc_stderr": 0.014049294536290396 - }, - "storycloze_2016": { - "acc": 0.6787814003206841, - "acc_stderr": 0.010798029402794916 - }, - "boolq": { - "acc": 0.5370030581039755, - "acc_stderr": 0.008721074177479657 - }, - "arc_easy": { - "acc": 0.5589225589225589, - "acc_stderr": 0.01018829322104057, - "acc_norm": 0.5542929292929293, - "acc_norm_stderr": 0.010199118183322989 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252427 - }, - "sciq": { - "acc": 0.858, - "acc_stderr": 0.011043457699378232, - "acc_norm": 0.849, - "acc_norm_stderr": 0.011328165223341678 - }, - "piqa": { - "acc": 0.7252448313384113, - "acc_stderr": 0.010415033676676042, - "acc_norm": 0.719260065288357, - "acc_norm_stderr": 0.010484325438311827 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_5.csv b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..bea89ffba550c1b9348e4445980a1c7da477208b --- /dev/null +++ b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.358,0.01516792886540756,0 +anli_r2,acc,0.346,0.015050266127564448,0 +anli_r3,acc,0.3525,0.013797164918918364,0 +arc_challenge,acc,0.2627986348122867,0.012862523175351331,0 +arc_challenge,acc_norm,0.2841296928327645,0.013179442447653886,0 +arc_easy,acc,0.5702861952861953,0.010157908005763676,0 +arc_easy,acc_norm,0.561026936026936,0.010183076012972057,0 +boolq,acc,0.5357798165137615,0.008722635482201091,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.28173472617917067,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.41963752240589525,0.004924910433106355,0 +hellaswag,acc_norm,0.5517825134435371,0.004962949784236046,0 +piqa,acc,0.7306855277475517,0.010350004070588758,0 +piqa,acc_norm,0.7236126224156693,0.010434162388275601,0 +rte,acc,0.5812274368231047,0.02969666108123482,0 +sciq,acc,0.859,0.01101091459599244,0 +sciq,acc_norm,0.85,0.011297239823409293,0 +storycloze_2016,acc,0.6755745590593266,0.010826131344990893,0 +winogrande,acc,0.5295974743488555,0.014027843827840083,0 diff --git a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json b/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json deleted file mode 100644 index 75abae921a3e93a1fd509bec763bc35b33f54ea3..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed1/evaluation/rankeval/2b855b4bc4seed1_5_lm-eval_global_step52452_2023-02-15-00-33-59_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.358, - "acc_stderr": 0.01516792886540756 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.015050266127564448 - }, - "anli_r3": { - "acc": 0.3525, - "acc_stderr": 0.013797164918918364 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.28173472617917067 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.41963752240589525, - "acc_stderr": 0.004924910433106355, - "acc_norm": 0.5517825134435371, - "acc_norm_stderr": 0.004962949784236046 - }, - "rte": { - "acc": 0.5812274368231047, - "acc_stderr": 0.02969666108123482 - }, - "winogrande": { - "acc": 0.5295974743488555, - "acc_stderr": 0.014027843827840083 - }, - "storycloze_2016": { - "acc": 0.6755745590593266, - "acc_stderr": 0.010826131344990893 - }, - "boolq": { - "acc": 0.5357798165137615, - "acc_stderr": 0.008722635482201091 - }, - "arc_easy": { - "acc": 0.5702861952861953, - "acc_stderr": 0.010157908005763676, - "acc_norm": 0.561026936026936, - "acc_norm_stderr": 0.010183076012972057 - }, - "arc_challenge": { - "acc": 0.2627986348122867, - "acc_stderr": 0.012862523175351331, - "acc_norm": 0.2841296928327645, - "acc_norm_stderr": 0.013179442447653886 - }, - "sciq": { - "acc": 0.859, - "acc_stderr": 0.01101091459599244, - "acc_norm": 0.85, - "acc_norm_stderr": 0.011297239823409293 - }, - "piqa": { - "acc": 0.7306855277475517, - "acc_stderr": 0.010350004070588758, - "acc_norm": 0.7236126224156693, - "acc_norm_stderr": 0.010434162388275601 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/generation/merged.csv b/2b855b4bc4seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..366d901360f0c0d80c3e41fd8801e54888f0ad46 --- /dev/null +++ b/2b855b4bc4seed2/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.01946643931866803 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.01946643931866803 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1207172260072706 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1207172260072706 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.15873688166583041 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.15873688166583041 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.1800347551029311 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.1800347551029311 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.18523471171404504 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18523471171404504 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1867641974424367 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1867641974424367 +e2e_nlg_cleaned,5,average,multiple,0.14182570187519697 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04098024232918615 +gem_xsum,0,median,rouge2_fmeasure,0.04098024232918615 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.024293197626076832 +gem_xsum,1,median,rouge2_fmeasure,0.024293197626076832 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.024249184932784353 +gem_xsum,2,median,rouge2_fmeasure,0.024249184932784353 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.02217727793566935 +gem_xsum,3,median,rouge2_fmeasure,0.02217727793566935 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0060094374125094045 +gem_xsum,4,median,rouge2_fmeasure,0.0060094374125094045 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001717122214549316 +gem_xsum,5,median,rouge2_fmeasure,0.0001717122214549316 +gem_xsum,5,average,multiple,0.01964684207628017 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05405465170623507 +web_nlg_en,0,median,rouge2_fmeasure,0.05405465170623507 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.04972414183277028 +web_nlg_en,1,median,rouge2_fmeasure,0.04972414183277028 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05233199147111919 +web_nlg_en,2,median,rouge2_fmeasure,0.05233199147111919 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.053640775172384275 +web_nlg_en,3,median,rouge2_fmeasure,0.053640775172384275 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05154135360572808 +web_nlg_en,4,median,rouge2_fmeasure,0.05154135360572808 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05502925181969434 +web_nlg_en,5,median,rouge2_fmeasure,0.05502925181969434 +web_nlg_en,5,average,multiple,0.05272036093465521 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03006319491141446 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03006319491141446 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04126993345622598 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04126993345622598 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04483999125400811 +wiki_lingua_en,2,median,rouge2_fmeasure,0.04483999125400811 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03906381568194331 +wiki_lingua_en,3,median,rouge2_fmeasure,0.03906381568194331 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012683758363803145 +wiki_lingua_en,4,median,rouge2_fmeasure,0.012683758363803145 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020664571727675384 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0020664571727675384 +wiki_lingua_en,5,average,multiple,0.028331191806693756 diff --git a/2b855b4bc4seed2/evaluation/generation/merged.json b/2b855b4bc4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..7be55bd3c78936a9b9fb68b13dc70445f802cc39 --- /dev/null +++ b/2b855b4bc4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.37516249318160555, "bleu_stderr": 0.03725430404871649, "rouge1_fmeasure": 0.11258477245803407, "rouge1_fmeasure_stderr": 0.002216239369057944, "rouge1_precision": 0.07616156236466372, "rouge1_precision_stderr": 0.0019536354727816042, "rouge1_recall": 0.31631197068415395, "rouge1_recall_stderr": 0.0048717251484713875, "rouge2_fmeasure": 0.05405465170623507, "rouge2_fmeasure_stderr": 0.0014172312593382934, "rouge2_precision": 0.036674079164111424, "rouge2_precision_stderr": 0.0012860256748924669, "rouge2_recall": 0.15472499280625326, "rouge2_recall_stderr": 0.0033984269420311145, "rougeL_fmeasure": 0.10664260091985779, "rougeL_fmeasure_stderr": 0.0020121377155400185, "rougeL_precision": 0.07176399001439152, "rougeL_precision_stderr": 0.0017792296639138315, "rougeL_recall": 0.3037119746850792, "rougeL_recall_stderr": 0.004740642058045769, "rougeLsum_fmeasure": 0.10576478746330406, "rougeLsum_fmeasure_stderr": 0.0020692845023136193, "rougeLsum_precision": 0.07163396173839576, "rougeLsum_precision_stderr": 0.0018509534932570329, "rougeLsum_recall": 0.2966713188309255, "rougeLsum_recall_stderr": 0.00452552483650827}}, "1": {"PALM_prompt": {"bleu": 0.41452746154833603, "bleu_stderr": 0.033072846214980314, "rouge1_fmeasure": 0.10931056860076818, "rouge1_fmeasure_stderr": 0.0019262739374116611, "rouge1_precision": 0.06988316236782911, "rouge1_precision_stderr": 0.0014122880779461023, "rouge1_recall": 0.3520271865071913, "rouge1_recall_stderr": 0.0052094341040695834, "rouge2_fmeasure": 0.04972414183277028, "rouge2_fmeasure_stderr": 0.0012063704558010941, "rouge2_precision": 0.03178399358933365, "rouge2_precision_stderr": 0.0008621494769483743, "rouge2_recall": 0.1651778391467833, "rouge2_recall_stderr": 0.0035209419087233766, "rougeL_fmeasure": 0.10105558919092507, "rougeL_fmeasure_stderr": 0.0017216512808509597, "rougeL_precision": 0.06451663789488651, "rougeL_precision_stderr": 0.0012568357646516105, "rougeL_recall": 0.3265269999934637, "rougeL_recall_stderr": 0.004748465581686739, "rougeLsum_fmeasure": 0.10368185436476693, "rougeLsum_fmeasure_stderr": 0.0018229096629970086, "rougeLsum_precision": 0.06636324781856313, "rougeLsum_precision_stderr": 0.0013409489164363578, "rougeLsum_recall": 0.3326061393871403, "rougeLsum_recall_stderr": 0.0048249589808027045}}, "2": {"PALM_prompt": {"bleu": 0.4478475765238552, "bleu_stderr": 0.020679810031729366, "rouge1_fmeasure": 0.11414238206007953, "rouge1_fmeasure_stderr": 0.0018438640059437164, "rouge1_precision": 0.07295531810707717, "rouge1_precision_stderr": 0.0013909049423640753, "rouge1_recall": 0.37125637456684, "rouge1_recall_stderr": 0.005135464726215005, "rouge2_fmeasure": 0.05233199147111919, "rouge2_fmeasure_stderr": 0.0011705259809955642, "rouge2_precision": 0.03342269677424361, "rouge2_precision_stderr": 0.0008557904609292308, "rouge2_recall": 0.17889831177670526, "rouge2_recall_stderr": 0.0036505673371000076, "rougeL_fmeasure": 0.10516935793044706, "rougeL_fmeasure_stderr": 0.001657601391360008, "rougeL_precision": 0.06717991708867377, "rougeL_precision_stderr": 0.001245124354600545, "rougeL_recall": 0.34183794737758194, "rougeL_recall_stderr": 0.004647287509466958, "rougeLsum_fmeasure": 0.10814246827203933, "rougeLsum_fmeasure_stderr": 0.0017464750270096022, "rougeLsum_precision": 0.06922150091239954, "rougeLsum_precision_stderr": 0.0013236829188076847, "rougeLsum_recall": 0.3497863647501044, "rougeLsum_recall_stderr": 0.00473764664879497}}, "3": {"PALM_prompt": {"bleu": 0.5034889672472006, "bleu_stderr": 0.025586230128342272, "rouge1_fmeasure": 0.1161707943807345, "rouge1_fmeasure_stderr": 0.001869104342988599, "rouge1_precision": 0.07428349552517428, "rouge1_precision_stderr": 0.001441017671471708, "rouge1_recall": 0.37697250173714886, "rouge1_recall_stderr": 0.005089516188289117, "rouge2_fmeasure": 0.053640775172384275, "rouge2_fmeasure_stderr": 0.0012031296165399552, "rouge2_precision": 0.034270303825389536, "rouge2_precision_stderr": 0.0008906550546184829, "rouge2_recall": 0.1824633090515, "rouge2_recall_stderr": 0.0035890506141956937, "rougeL_fmeasure": 0.1067099016352893, "rougeL_fmeasure_stderr": 0.001670433243008805, "rougeL_precision": 0.06815997829683317, "rougeL_precision_stderr": 0.0012682217921990918, "rougeL_recall": 0.34657883659538236, "rougeL_recall_stderr": 0.004596212478579083, "rougeLsum_fmeasure": 0.11002849219012008, "rougeLsum_fmeasure_stderr": 0.0017578542494400985, "rougeLsum_precision": 0.07038442220305176, "rougeLsum_precision_stderr": 0.0013444234999027572, "rougeLsum_recall": 0.35608727397135903, "rougeLsum_recall_stderr": 0.004705658093706774}}, "4": {"PALM_prompt": {"bleu": 0.5048431869054169, "bleu_stderr": 0.02434442116430955, "rouge1_fmeasure": 0.11353570672868046, "rouge1_fmeasure_stderr": 0.0017683814283851853, "rouge1_precision": 0.07210198703469012, "rouge1_precision_stderr": 0.00132009076247589, "rouge1_recall": 0.3759880238972655, "rouge1_recall_stderr": 0.005142053794614162, "rouge2_fmeasure": 0.05154135360572808, "rouge2_fmeasure_stderr": 0.001116013949220999, "rouge2_precision": 0.032557961117684935, "rouge2_precision_stderr": 0.000789994442456243, "rouge2_recall": 0.18068071587872336, "rouge2_recall_stderr": 0.0036232010571203514, "rougeL_fmeasure": 0.10331614009925619, "rougeL_fmeasure_stderr": 0.001593817388019866, "rougeL_precision": 0.06557741254617991, "rougeL_precision_stderr": 0.001178582447983872, "rougeL_recall": 0.3409869889741015, "rougeL_recall_stderr": 0.004545680864729979, "rougeLsum_fmeasure": 0.10676658119707082, "rougeLsum_fmeasure_stderr": 0.0016627984388464913, "rougeLsum_precision": 0.06778359895688292, "rougeLsum_precision_stderr": 0.0012347757820026315, "rougeLsum_recall": 0.35303946692767074, "rougeLsum_recall_stderr": 0.0047472742145304074}}, "5": {"PALM_prompt": {"bleu": 0.5795275090362991, "bleu_stderr": 0.039285670070425056, "rouge1_fmeasure": 0.11938407940958841, "rouge1_fmeasure_stderr": 0.0018668942153975093, "rouge1_precision": 0.07598890560555023, "rouge1_precision_stderr": 0.0014306482479636419, "rouge1_recall": 0.3957792356120826, "rouge1_recall_stderr": 0.005188791322617074, "rouge2_fmeasure": 0.05502925181969434, "rouge2_fmeasure_stderr": 0.0012140869247319417, "rouge2_precision": 0.03501175670600515, "rouge2_precision_stderr": 0.0009074008709786058, "rouge2_recall": 0.19207922591859808, "rouge2_recall_stderr": 0.0036758936213242884, "rougeL_fmeasure": 0.10809770458520097, "rougeL_fmeasure_stderr": 0.001647502833392072, "rougeL_precision": 0.06879655860973856, "rougeL_precision_stderr": 0.0012615638512354452, "rougeL_recall": 0.35778037702192333, "rougeL_recall_stderr": 0.004556501063946681, "rougeLsum_fmeasure": 0.11233962218995018, "rougeLsum_fmeasure_stderr": 0.0017469526504193533, "rougeLsum_precision": 0.07154945009969603, "rougeLsum_precision_stderr": 0.0013391463113787488, "rougeLsum_recall": 0.37176729162891425, "rougeLsum_recall_stderr": 0.004780732995141851}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.3029059798903195, "bleu_stderr": 0.046578732129754365, "rouge1_fmeasure": 0.16310670488210566, "rouge1_fmeasure_stderr": 0.0017810116631883403, "rouge1_precision": 0.138975970921875, "rouge1_precision_stderr": 0.0017810425696214893, "rouge1_recall": 0.23779368721493221, "rouge1_recall_stderr": 0.002627275988742985, "rouge2_fmeasure": 0.03006319491141446, "rouge2_fmeasure_stderr": 0.0007580452438252258, "rouge2_precision": 0.02529302178649697, "rouge2_precision_stderr": 0.0006622308826581347, "rouge2_recall": 0.04573672091663288, "rouge2_recall_stderr": 0.001277663033673529, "rougeL_fmeasure": 0.12868324777888854, "rougeL_fmeasure_stderr": 0.0012900584205668409, "rougeL_precision": 0.10849433436089662, "rougeL_precision_stderr": 0.0012658440867435025, "rougeL_recall": 0.19175489122136818, "rougeL_recall_stderr": 0.0021109155396307365, "rougeLsum_fmeasure": 0.15106025276814786, "rougeLsum_fmeasure_stderr": 0.001641152736645161, "rougeLsum_precision": 0.1286141329852268, "rougeLsum_precision_stderr": 0.0016415162395616372, "rougeLsum_recall": 0.22071002215347388, "rougeLsum_recall_stderr": 0.002438109388749605}}, "1": {"tldr_en": {"bleu": 2.0198334532518154, "bleu_stderr": 0.06278418176557252, "rouge1_fmeasure": 0.19373986459558043, "rouge1_fmeasure_stderr": 0.001874277747989523, "rouge1_precision": 0.16679588594664752, "rouge1_precision_stderr": 0.001966832867544324, "rouge1_recall": 0.2801052042760329, "rouge1_recall_stderr": 0.002700327810546546, "rouge2_fmeasure": 0.04126993345622598, "rouge2_fmeasure_stderr": 0.0009148385917477017, "rouge2_precision": 0.03538029526635071, "rouge2_precision_stderr": 0.0008314034343890925, "rouge2_recall": 0.06152867812379509, "rouge2_recall_stderr": 0.001520633674778812, "rougeL_fmeasure": 0.1408304606586143, "rougeL_fmeasure_stderr": 0.0012734775886041698, "rougeL_precision": 0.12004112869063698, "rougeL_precision_stderr": 0.001323881030199995, "rougeL_recall": 0.20880168304592367, "rougeL_recall_stderr": 0.002101228724181874, "rougeLsum_fmeasure": 0.18214103404640877, "rougeLsum_fmeasure_stderr": 0.0017519117643348603, "rougeLsum_precision": 0.1565574306771253, "rougeLsum_precision_stderr": 0.00183325128669811, "rougeLsum_recall": 0.26435424655320383, "rougeLsum_recall_stderr": 0.002565944836177731}}, "2": {"tldr_en": {"bleu": 2.265843481554971, "bleu_stderr": 0.04799414248307766, "rouge1_fmeasure": 0.20405373487144418, "rouge1_fmeasure_stderr": 0.0018604941963304442, "rouge1_precision": 0.17738493368711764, "rouge1_precision_stderr": 0.0020390778982164117, "rouge1_recall": 0.29235012140800787, "rouge1_recall_stderr": 0.002639871189290392, "rouge2_fmeasure": 0.04483999125400811, "rouge2_fmeasure_stderr": 0.0009315227270254619, "rouge2_precision": 0.038870810291485945, "rouge2_precision_stderr": 0.0008727966633756155, "rouge2_recall": 0.06651635285730527, "rouge2_recall_stderr": 0.0015766492359726362, "rougeL_fmeasure": 0.1468635358861475, "rougeL_fmeasure_stderr": 0.0012701984568338426, "rougeL_precision": 0.12636493466234724, "rougeL_precision_stderr": 0.0013691823654402595, "rougeL_recall": 0.21576881990700142, "rougeL_recall_stderr": 0.0020948339442364147, "rougeLsum_fmeasure": 0.19182932756562945, "rougeLsum_fmeasure_stderr": 0.0017467063780015946, "rougeLsum_precision": 0.166561326705652, "rougeLsum_precision_stderr": 0.001910374257076865, "rougeLsum_recall": 0.2754013574129904, "rougeLsum_recall_stderr": 0.0025030792401313866}}, "3": {"tldr_en": {"bleu": 2.364393997119503, "bleu_stderr": 0.06850015909771101, "rouge1_fmeasure": 0.1713794708380661, "rouge1_fmeasure_stderr": 0.002168848215407466, "rouge1_precision": 0.15918316767688395, "rouge1_precision_stderr": 0.0025768960045349683, "rouge1_recall": 0.24426768042761024, "rouge1_recall_stderr": 0.0031766865065580474, "rouge2_fmeasure": 0.03906381568194331, "rouge2_fmeasure_stderr": 0.0009295521437750448, "rouge2_precision": 0.03668261391375714, "rouge2_precision_stderr": 0.0011164660633241534, "rouge2_recall": 0.0573993015182471, "rouge2_recall_stderr": 0.0015283758768778288, "rougeL_fmeasure": 0.12405347650379281, "rougeL_fmeasure_stderr": 0.0015272755617330915, "rougeL_precision": 0.11594749741913975, "rougeL_precision_stderr": 0.0019766269623558074, "rougeL_recall": 0.18010793859343813, "rougeL_recall_stderr": 0.0024315704429741553, "rougeLsum_fmeasure": 0.16141891923887208, "rougeLsum_fmeasure_stderr": 0.0020384371143834226, "rougeLsum_precision": 0.15000959315896895, "rougeLsum_precision_stderr": 0.002440793563712419, "rougeLsum_recall": 0.2304315165685809, "rougeLsum_recall_stderr": 0.003002002405225945}}, "4": {"tldr_en": {"bleu": 0.5195262580663632, "bleu_stderr": 0.03939747655122071, "rouge1_fmeasure": 0.05615887235243026, "rouge1_fmeasure_stderr": 0.0018963743685809216, "rouge1_precision": 0.052533439762248134, "rouge1_precision_stderr": 0.001991441783154726, "rouge1_recall": 0.08284325304787615, "rouge1_recall_stderr": 0.0028337931739627602, "rouge2_fmeasure": 0.012683758363803145, "rouge2_fmeasure_stderr": 0.0006233253944136847, "rouge2_precision": 0.012087381785273422, "rouge2_precision_stderr": 0.0007394473914769437, "rouge2_recall": 0.019353016669740073, "rouge2_recall_stderr": 0.0010280257756189173, "rougeL_fmeasure": 0.04147035269891886, "rougeL_fmeasure_stderr": 0.0013854142055160317, "rougeL_precision": 0.03892101877727309, "rougeL_precision_stderr": 0.0015058143246837348, "rougeL_recall": 0.062486990168287834, "rougeL_recall_stderr": 0.002173822112261541, "rougeLsum_fmeasure": 0.052311985327473956, "rougeLsum_fmeasure_stderr": 0.0017652906836829396, "rougeLsum_precision": 0.04901282291494968, "rougeLsum_precision_stderr": 0.0018674149276485955, "rougeLsum_recall": 0.07721212182337703, "rougeLsum_recall_stderr": 0.002645165413211024}}, "5": {"tldr_en": {"bleu": 1.6827562344702503e-06, "bleu_stderr": 3.541856160800471e-06, "rouge1_fmeasure": 0.009003330276868858, "rouge1_fmeasure_stderr": 0.0008538448246305624, "rouge1_precision": 0.008660668336066173, "rouge1_precision_stderr": 0.0008969133080978192, "rouge1_recall": 0.013369642662239885, "rouge1_recall_stderr": 0.0012726067243266036, "rouge2_fmeasure": 0.0020664571727675384, "rouge2_fmeasure_stderr": 0.0002632128439215759, "rouge2_precision": 0.0020060586123193586, "rouge2_precision_stderr": 0.00028611755488053877, "rouge2_recall": 0.0031460047363294622, "rouge2_recall_stderr": 0.0004204822511919191, "rougeL_fmeasure": 0.006678718787016958, "rougeL_fmeasure_stderr": 0.0006281571319252547, "rougeL_precision": 0.006448018770799836, "rougeL_precision_stderr": 0.0006707529865298435, "rougeL_recall": 0.010046253934232799, "rougeL_recall_stderr": 0.0009587944921195498, "rougeLsum_fmeasure": 0.008345968953951927, "rougeLsum_fmeasure_stderr": 0.0007915942376671939, "rougeLsum_precision": 0.008068304074380759, "rougeLsum_precision_stderr": 0.0008413669320006186, "rougeLsum_recall": 0.012432432426944097, "rougeLsum_recall_stderr": 0.0011903718617197502}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.124328753341782, "bleu_stderr": 0.05406501685486947, "rouge1_fmeasure": 0.09198549145786886, "rouge1_fmeasure_stderr": 0.001827857204190341, "rouge1_precision": 0.07537372447465332, "rouge1_precision_stderr": 0.0017362152834112278, "rouge1_recall": 0.13295790557487902, "rouge1_recall_stderr": 0.0024452128942610133, "rouge2_fmeasure": 0.01946643931866803, "rouge2_fmeasure_stderr": 0.0008051260721889828, "rouge2_precision": 0.016678181983838202, "rouge2_precision_stderr": 0.0007531357933848652, "rouge2_recall": 0.026643094738003435, "rouge2_recall_stderr": 0.0010954788605203481, "rougeL_fmeasure": 0.08436877016730254, "rougeL_fmeasure_stderr": 0.0015674045720625268, "rougeL_precision": 0.0683353587637062, "rougeL_precision_stderr": 0.0014417477504321229, "rougeL_recall": 0.12333437089751408, "rougeL_recall_stderr": 0.002152276497377831, "rougeLsum_fmeasure": 0.080096683792661, "rougeLsum_fmeasure_stderr": 0.0015912704762644358, "rougeLsum_precision": 0.06562373030827319, "rougeLsum_precision_stderr": 0.0015239883068621095, "rougeLsum_recall": 0.11614379920488621, "rougeLsum_recall_stderr": 0.0021269432179319083}}, "1": {"generate_text_restaurant": {"bleu": 5.082149173269909, "bleu_stderr": 0.0813244408770294, "rouge1_fmeasure": 0.30864493845767316, "rouge1_fmeasure_stderr": 0.0020365218105706305, "rouge1_precision": 0.25722251185581524, "rouge1_precision_stderr": 0.0023311243415687884, "rouge1_recall": 0.43748585947603214, "rouge1_recall_stderr": 0.002685693051635844, "rouge2_fmeasure": 0.1207172260072706, "rouge2_fmeasure_stderr": 0.0013754791721349495, "rouge2_precision": 0.10036847523705841, "rouge2_precision_stderr": 0.0013851806300992837, "rouge2_recall": 0.17449570571851056, "rouge2_recall_stderr": 0.0019826916500543116, "rougeL_fmeasure": 0.24773314584121203, "rougeL_fmeasure_stderr": 0.0014237561138094287, "rougeL_precision": 0.20357830885937783, "rougeL_precision_stderr": 0.0015912427888804459, "rougeL_recall": 0.35845292565643094, "rougeL_recall_stderr": 0.002305030632329364, "rougeLsum_fmeasure": 0.24916106748055827, "rougeLsum_fmeasure_stderr": 0.0018546119696254309, "rougeLsum_precision": 0.20763305550159702, "rougeLsum_precision_stderr": 0.0020342490049155876, "rougeLsum_recall": 0.3539189245160475, "rougeLsum_recall_stderr": 0.0025332618094683233}}, "2": {"generate_text_restaurant": {"bleu": 7.1070444075388215, "bleu_stderr": 0.1458503325729145, "rouge1_fmeasure": 0.37176240718167497, "rouge1_fmeasure_stderr": 0.002263788175442525, "rouge1_precision": 0.3471468025074536, "rouge1_precision_stderr": 0.002928519407597079, "rouge1_recall": 0.454908579684014, "rouge1_recall_stderr": 0.0027034728544148494, "rouge2_fmeasure": 0.15873688166583041, "rouge2_fmeasure_stderr": 0.0016232323689229652, "rouge2_precision": 0.1481190201719996, "rouge2_precision_stderr": 0.0017897719201289616, "rouge2_recall": 0.1968812341089603, "rouge2_recall_stderr": 0.0020639601495663833, "rougeL_fmeasure": 0.27623151033138665, "rougeL_fmeasure_stderr": 0.0016527323331937437, "rougeL_precision": 0.25567621849199923, "rougeL_precision_stderr": 0.0021239715957507344, "rougeL_recall": 0.3438410947514222, "rougeL_recall_stderr": 0.0022787544525951177, "rougeLsum_fmeasure": 0.30550364486434745, "rougeLsum_fmeasure_stderr": 0.002104095988367069, "rougeLsum_precision": 0.28522867320862594, "rougeLsum_precision_stderr": 0.002581047894924107, "rougeLsum_recall": 0.3743581815037791, "rougeLsum_recall_stderr": 0.0025858782633611773}}, "3": {"generate_text_restaurant": {"bleu": 8.768437902160239, "bleu_stderr": 0.13435357597099454, "rouge1_fmeasure": 0.404157448113894, "rouge1_fmeasure_stderr": 0.0020589327973415056, "rouge1_precision": 0.3936119513800254, "rouge1_precision_stderr": 0.0027812000103646824, "rouge1_recall": 0.4632093681098066, "rouge1_recall_stderr": 0.002683540831503137, "rouge2_fmeasure": 0.1800347551029311, "rouge2_fmeasure_stderr": 0.0016947029129108552, "rouge2_precision": 0.17544888277807247, "rouge2_precision_stderr": 0.0019286502065645333, "rouge2_recall": 0.20847378423473698, "rouge2_recall_stderr": 0.002083206557838331, "rougeL_fmeasure": 0.28778567454677273, "rougeL_fmeasure_stderr": 0.0017340403544009349, "rougeL_precision": 0.28055160523303657, "rougeL_precision_stderr": 0.002274623211108698, "rougeL_recall": 0.3313213636734938, "rougeL_recall_stderr": 0.002265600276134078, "rougeLsum_fmeasure": 0.33803671939829244, "rougeLsum_fmeasure_stderr": 0.0020007958037446395, "rougeLsum_precision": 0.32911384675908034, "rougeLsum_precision_stderr": 0.0025545479162978444, "rougeLsum_recall": 0.3880780165953788, "rougeLsum_recall_stderr": 0.002578631317318316}}, "4": {"generate_text_restaurant": {"bleu": 9.257570813534436, "bleu_stderr": 0.11804490106551825, "rouge1_fmeasure": 0.41649846067482943, "rouge1_fmeasure_stderr": 0.0019741439446110845, "rouge1_precision": 0.4109702105973195, "rouge1_precision_stderr": 0.0026488822140560402, "rouge1_recall": 0.4662359820488044, "rouge1_recall_stderr": 0.0026640316823384673, "rouge2_fmeasure": 0.18523471171404504, "rouge2_fmeasure_stderr": 0.0016744088341693187, "rouge2_precision": 0.18318558454042064, "rouge2_precision_stderr": 0.0019026745915802451, "rouge2_recall": 0.20893319874685903, "rouge2_recall_stderr": 0.002033760985352723, "rougeL_fmeasure": 0.28778795947014973, "rougeL_fmeasure_stderr": 0.0017398603182013058, "rougeL_precision": 0.2855574480001324, "rougeL_precision_stderr": 0.002246940815527483, "rougeL_recall": 0.3216042735632492, "rougeL_recall_stderr": 0.002221469160798958, "rougeLsum_fmeasure": 0.34720571123378546, "rougeLsum_fmeasure_stderr": 0.001983081378364005, "rougeLsum_precision": 0.3424815041034081, "rougeLsum_precision_stderr": 0.002471533764862402, "rougeLsum_recall": 0.3890621923908457, "rougeLsum_recall_stderr": 0.002585478295796427}}, "5": {"generate_text_restaurant": {"bleu": 9.543776600805918, "bleu_stderr": 0.15953264164047723, "rouge1_fmeasure": 0.42007984633409984, "rouge1_fmeasure_stderr": 0.001942744364649238, "rouge1_precision": 0.4161542302452528, "rouge1_precision_stderr": 0.002617001414180615, "rouge1_recall": 0.46774717132291926, "rouge1_recall_stderr": 0.002656327921556222, "rouge2_fmeasure": 0.1867641974424367, "rouge2_fmeasure_stderr": 0.0016271859218098016, "rouge2_precision": 0.18528556894067952, "rouge2_precision_stderr": 0.001847588294263953, "rouge2_recall": 0.20995492634324714, "rouge2_recall_stderr": 0.002007163477696128, "rougeL_fmeasure": 0.2882813462935561, "rougeL_fmeasure_stderr": 0.0017023212878693036, "rougeL_precision": 0.28688957466117065, "rougeL_precision_stderr": 0.002194648031882239, "rougeL_recall": 0.3208800411792188, "rougeL_recall_stderr": 0.0022189945878685812, "rougeLsum_fmeasure": 0.3505047841653014, "rougeLsum_fmeasure_stderr": 0.0019506431156496053, "rougeLsum_precision": 0.34718268949684505, "rougeLsum_precision_stderr": 0.00245060355330784, "rougeLsum_recall": 0.39080744000931616, "rougeLsum_recall_stderr": 0.0025805902542978213}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6939633765858446, "bleu_stderr": 0.11775847719430056, "rouge1_fmeasure": 0.19553009306546762, "rouge1_fmeasure_stderr": 0.0024085768606284404, "rouge1_precision": 0.14445886053358095, "rouge1_precision_stderr": 0.0019706658678816376, "rouge1_recall": 0.33082612691552143, "rouge1_recall_stderr": 0.004198375153192516, "rouge2_fmeasure": 0.04098024232918615, "rouge2_fmeasure_stderr": 0.0014101889555456393, "rouge2_precision": 0.02960110619715472, "rouge2_precision_stderr": 0.0010464244114823283, "rouge2_recall": 0.07191919782669476, "rouge2_recall_stderr": 0.0025056826269908333, "rougeL_fmeasure": 0.1488848864708671, "rougeL_fmeasure_stderr": 0.0017966560545738415, "rougeL_precision": 0.10970033888652543, "rougeL_precision_stderr": 0.0014555121659556428, "rougeL_recall": 0.25323491564081496, "rougeL_recall_stderr": 0.003200163325230695, "rougeLsum_fmeasure": 0.1547519108334175, "rougeLsum_fmeasure_stderr": 0.0020142646864245565, "rougeLsum_precision": 0.11390610350834503, "rougeLsum_precision_stderr": 0.0015932376696393394, "rougeLsum_recall": 0.2630510068069291, "rougeLsum_recall_stderr": 0.0035483770604692625}}, "1": {"article_DOC_summary": {"bleu": 0.9826595853234955, "bleu_stderr": 0.06789716539628406, "rouge1_fmeasure": 0.15403004847174862, "rouge1_fmeasure_stderr": 0.0022060713968716813, "rouge1_precision": 0.1090864640319307, "rouge1_precision_stderr": 0.0016334201690781523, "rouge1_recall": 0.2727956212414536, "rouge1_recall_stderr": 0.0037947163565646433, "rouge2_fmeasure": 0.024293197626076832, "rouge2_fmeasure_stderr": 0.0011265618237352012, "rouge2_precision": 0.01703792140016959, "rouge2_precision_stderr": 0.0007950033101092582, "rouge2_recall": 0.044225907102136114, "rouge2_recall_stderr": 0.0020637317885166427, "rougeL_fmeasure": 0.1212787012676644, "rougeL_fmeasure_stderr": 0.0016835451256564055, "rougeL_precision": 0.08575057940681982, "rougeL_precision_stderr": 0.0012369096578299268, "rougeL_recall": 0.2159687660420945, "rougeL_recall_stderr": 0.003003860882616688, "rougeLsum_fmeasure": 0.12595670290650035, "rougeLsum_fmeasure_stderr": 0.001794196991080708, "rougeLsum_precision": 0.08898512877225818, "rougeLsum_precision_stderr": 0.0013134463525414503, "rougeLsum_recall": 0.22455132211561166, "rougeLsum_recall_stderr": 0.003194695634659946}}, "2": {"article_DOC_summary": {"bleu": 0.9198618354974833, "bleu_stderr": 0.07731462189355529, "rouge1_fmeasure": 0.15478174745772807, "rouge1_fmeasure_stderr": 0.0021333833587719093, "rouge1_precision": 0.10935025572516734, "rouge1_precision_stderr": 0.0015819754046859503, "rouge1_recall": 0.27580744290709325, "rouge1_recall_stderr": 0.003681414418011726, "rouge2_fmeasure": 0.024249184932784353, "rouge2_fmeasure_stderr": 0.0010688296217627906, "rouge2_precision": 0.01698251401655349, "rouge2_precision_stderr": 0.0007478487897436315, "rouge2_recall": 0.044201345395449454, "rouge2_recall_stderr": 0.0020104733146244285, "rougeL_fmeasure": 0.12395419839745862, "rougeL_fmeasure_stderr": 0.0016253749204288843, "rougeL_precision": 0.08745850505655622, "rougeL_precision_stderr": 0.001201683628083032, "rougeL_recall": 0.22178825996967086, "rougeL_recall_stderr": 0.0028772160802170463, "rougeLsum_fmeasure": 0.12533603925452638, "rougeLsum_fmeasure_stderr": 0.0017370324953664106, "rougeLsum_precision": 0.08835309869636648, "rougeLsum_precision_stderr": 0.0012715096199816893, "rougeLsum_recall": 0.2247383538285452, "rougeLsum_recall_stderr": 0.003127916430521942}}, "3": {"article_DOC_summary": {"bleu": 0.9274309124302671, "bleu_stderr": 0.07084825714611437, "rouge1_fmeasure": 0.14937130950455454, "rouge1_fmeasure_stderr": 0.002250043051674298, "rouge1_precision": 0.10785333195249253, "rouge1_precision_stderr": 0.0017654465113895702, "rouge1_recall": 0.2613472363093354, "rouge1_recall_stderr": 0.00385603951444163, "rouge2_fmeasure": 0.02217727793566935, "rouge2_fmeasure_stderr": 0.001092194342649263, "rouge2_precision": 0.015784242406608712, "rouge2_precision_stderr": 0.000804657872755561, "rouge2_recall": 0.04000495706640226, "rouge2_recall_stderr": 0.001989932733364014, "rougeL_fmeasure": 0.11795862864655915, "rougeL_fmeasure_stderr": 0.0017465741676525389, "rougeL_precision": 0.08502934925695547, "rougeL_precision_stderr": 0.0013682210127516498, "rougeL_recall": 0.20766928762009654, "rougeL_recall_stderr": 0.003107606467045228, "rougeLsum_fmeasure": 0.12163338921148069, "rougeLsum_fmeasure_stderr": 0.0018824843867231917, "rougeLsum_precision": 0.08764532413891765, "rougeLsum_precision_stderr": 0.001465938389903131, "rougeLsum_recall": 0.21433299101001138, "rougeLsum_recall_stderr": 0.0033524307536739097}}, "4": {"article_DOC_summary": {"bleu": 0.4737899246551093, "bleu_stderr": 0.09754729227103104, "rouge1_fmeasure": 0.041925864948153646, "rouge1_fmeasure_stderr": 0.0023209906119167206, "rouge1_precision": 0.03607588338971154, "rouge1_precision_stderr": 0.0022890224129137474, "rouge1_recall": 0.0668824616254433, "rouge1_recall_stderr": 0.0038073670525790716, "rouge2_fmeasure": 0.0060094374125094045, "rouge2_fmeasure_stderr": 0.000664309352146078, "rouge2_precision": 0.004529588106286902, "rouge2_precision_stderr": 0.0005037336569243048, "rouge2_recall": 0.01015954884908807, "rouge2_recall_stderr": 0.0011604371523331234, "rougeL_fmeasure": 0.033131597429885286, "rougeL_fmeasure_stderr": 0.0018239013228270276, "rougeL_precision": 0.029115361596343755, "rougeL_precision_stderr": 0.001978353536121269, "rougeL_recall": 0.052982765271586685, "rougeL_recall_stderr": 0.0030260429302117484, "rougeLsum_fmeasure": 0.034546112129003846, "rougeLsum_fmeasure_stderr": 0.0019150957263615303, "rougeLsum_precision": 0.030300991357313418, "rougeLsum_precision_stderr": 0.0020393578588917866, "rougeLsum_recall": 0.055167433802113613, "rougeLsum_recall_stderr": 0.003157148620047266}}, "5": {"article_DOC_summary": {"bleu": 3.720304006780699e-39, "bleu_stderr": 2.926181454826911e-34, "rouge1_fmeasure": 0.002378685432139298, "rouge1_fmeasure_stderr": 0.0006567741499410293, "rouge1_precision": 0.002655029617395994, "rouge1_precision_stderr": 0.0007483667011227483, "rouge1_recall": 0.0022367452273988025, "rouge1_recall_stderr": 0.0006119766398960414, "rouge2_fmeasure": 0.0001717122214549316, "rouge2_fmeasure_stderr": 8.607684989904236e-05, "rouge2_precision": 0.00022248607847941918, "rouge2_precision_stderr": 0.00011139226881842093, "rouge2_recall": 0.00014080259363278232, "rouge2_recall_stderr": 7.091720968728075e-05, "rougeL_fmeasure": 0.0017942745613513307, "rougeL_fmeasure_stderr": 0.00047540196470043436, "rougeL_precision": 0.0020092087119220714, "rougeL_precision_stderr": 0.0005492118505201973, "rougeL_recall": 0.001690787531665866, "rougeL_recall_stderr": 0.00044210668694483695, "rougeLsum_fmeasure": 0.002075971659086164, "rougeLsum_fmeasure_stderr": 0.0005619106032809446, "rougeLsum_precision": 0.002309004083713471, "rougeLsum_precision_stderr": 0.0006382012648138263, "rougeLsum_recall": 0.001965037216292909, "rougeLsum_recall_stderr": 0.0005307483457326771}}}} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_0.csv b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..695cacd0227343ad4807ef40141dd9cdff91458b --- /dev/null +++ b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.01491084616422986,0 +anli_r2,acc,0.333,0.014910846164229856,0 +anli_r3,acc,0.3408333333333333,0.013688600793296937,0 +arc_challenge,acc,0.23890784982935154,0.012461071376316614,0 +arc_challenge,acc_norm,0.2551194539249147,0.012739038695202104,0 +arc_easy,acc,0.5484006734006734,0.010211600726405224,0 +arc_easy,acc_norm,0.48863636363636365,0.010257133441117113,0 +boolq,acc,0.5504587155963303,0.008700409761350801,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.3618080033174373,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.4192391953794065,0.004924261467934419,0 +hellaswag,acc_norm,0.5364469229237204,0.004976507121076277,0 +piqa,acc,0.735582154515778,0.010289787244767166,0 +piqa,acc_norm,0.7388465723612623,0.010248738649935588,0 +rte,acc,0.5776173285198556,0.029731622646495887,0 +sciq,acc,0.796,0.012749374359024375,0 +sciq,acc_norm,0.719,0.014221154708434937,0 +storycloze_2016,acc,0.6745056119722074,0.010835369677013445,0 +winogrande,acc,0.5359116022099447,0.014016193433958308,0 diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json deleted file mode 100644 index afd9df70ee3ab002ffa0b37acc47d1eb17108a6b..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_0_lm-eval_global_step52452_2023-02-13-10-25-20_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.01491084616422986 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229856 - }, - "anli_r3": { - "acc": 0.3408333333333333, - "acc_stderr": 0.013688600793296937 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.3618080033174373 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.4192391953794065, - "acc_stderr": 0.004924261467934419, - "acc_norm": 0.5364469229237204, - "acc_norm_stderr": 0.004976507121076277 - }, - "rte": { - "acc": 0.5776173285198556, - "acc_stderr": 0.029731622646495887 - }, - "winogrande": { - "acc": 0.5359116022099447, - "acc_stderr": 0.014016193433958308 - }, - "storycloze_2016": { - "acc": 0.6745056119722074, - "acc_stderr": 0.010835369677013445 - }, - "boolq": { - "acc": 0.5504587155963303, - "acc_stderr": 0.008700409761350801 - }, - "arc_easy": { - "acc": 0.5484006734006734, - "acc_stderr": 0.010211600726405224, - "acc_norm": 0.48863636363636365, - "acc_norm_stderr": 0.010257133441117113 - }, - "arc_challenge": { - "acc": 0.23890784982935154, - "acc_stderr": 0.012461071376316614, - "acc_norm": 0.2551194539249147, - "acc_norm_stderr": 0.012739038695202104 - }, - "sciq": { - "acc": 0.796, - "acc_stderr": 0.012749374359024375, - "acc_norm": 0.719, - "acc_norm_stderr": 0.014221154708434937 - }, - "piqa": { - "acc": 0.735582154515778, - "acc_stderr": 0.010289787244767166, - "acc_norm": 0.7388465723612623, - "acc_norm_stderr": 0.010248738649935588 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_1.csv b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..4a7a19fd3ff2643bd5a5643f2576aa1d654b945c --- /dev/null +++ b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.014806864733738859,0 +anli_r2,acc,0.33,0.014876872027456736,0 +anli_r3,acc,0.3275,0.013553211167251944,0 +arc_challenge,acc,0.2431740614334471,0.012536554144587092,0 +arc_challenge,acc_norm,0.2687713310580205,0.01295506596371069,0 +arc_easy,acc,0.5673400673400674,0.010166307932642865,0 +arc_easy,acc_norm,0.5155723905723906,0.010254806331961899,0 +boolq,acc,0.5740061162079511,0.008648732832949141,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.22999222999222999,,1 +copa,acc,0.68,0.04688261722621504,0 +hellaswag,acc,0.41734714200358497,0.004921133864931889,0 +hellaswag,acc_norm,0.5404301931886079,0.004973442060741633,0 +piqa,acc,0.7274211099020674,0.01038925680329602,0 +piqa,acc_norm,0.7328618063112078,0.010323440492612418,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.838,0.011657267771304415,0 +sciq,acc_norm,0.788,0.012931481864938048,0 +storycloze_2016,acc,0.6654195617316943,0.01091131896712794,0 +winogrande,acc,0.5374901341752171,0.014012928183336578,0 diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json deleted file mode 100644 index 30a948b96b0784c781b7a55103c0880fec824f2e..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_1_lm-eval_global_step52452_2023-02-13-10-25-19_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.014806864733738859 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.014876872027456736 - }, - "anli_r3": { - "acc": 0.3275, - "acc_stderr": 0.013553211167251944 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.22999222999222999 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621504 - }, - "hellaswag": { - "acc": 0.41734714200358497, - "acc_stderr": 0.004921133864931889, - "acc_norm": 0.5404301931886079, - "acc_norm_stderr": 0.004973442060741633 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5374901341752171, - "acc_stderr": 0.014012928183336578 - }, - "storycloze_2016": { - "acc": 0.6654195617316943, - "acc_stderr": 0.01091131896712794 - }, - "boolq": { - "acc": 0.5740061162079511, - "acc_stderr": 0.008648732832949141 - }, - "arc_easy": { - "acc": 0.5673400673400674, - "acc_stderr": 0.010166307932642865, - "acc_norm": 0.5155723905723906, - "acc_norm_stderr": 0.010254806331961899 - }, - "arc_challenge": { - "acc": 0.2431740614334471, - "acc_stderr": 0.012536554144587092, - "acc_norm": 0.2687713310580205, - "acc_norm_stderr": 0.01295506596371069 - }, - "sciq": { - "acc": 0.838, - "acc_stderr": 0.011657267771304415, - "acc_norm": 0.788, - "acc_norm_stderr": 0.012931481864938048 - }, - "piqa": { - "acc": 0.7274211099020674, - "acc_stderr": 0.01038925680329602, - "acc_norm": 0.7328618063112078, - "acc_norm_stderr": 0.010323440492612418 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_2.csv b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..f3d25903cdf909615f3f4637871eb0f846f4c4f4 --- /dev/null +++ b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.305,0.014566646394664404,0 +anli_r2,acc,0.332,0.014899597242811488,0 +anli_r3,acc,0.3283333333333333,0.013562032919529026,0 +arc_challenge,acc,0.24829351535836178,0.012624912868089755,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0 +arc_easy,acc,0.5601851851851852,0.010185185185185313,0 +arc_easy,acc_norm,0.5391414141414141,0.010228298200766128,0 +boolq,acc,0.5697247706422018,0.008659608602932495,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.23913043478260868,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.41495717984465247,0.004917076726623791,0 +hellaswag,acc_norm,0.536247759410476,0.004976651989757641,0 +piqa,acc,0.7290533188248096,0.010369718937426843,0 +piqa,acc_norm,0.7268770402611534,0.010395730264453258,0 +rte,acc,0.5342960288808665,0.03002557981936643,0 +sciq,acc,0.845,0.011450157470799471,0 +sciq,acc_norm,0.821,0.012128730605719116,0 +storycloze_2016,acc,0.669695350080171,0.010876149841754851,0 +winogrande,acc,0.5422257300710339,0.014002284504422438,0 diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json deleted file mode 100644 index 7d9fb0289666b50e65695065124b7193d10517a6..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_2_lm-eval_global_step52452_2023-02-13-10-25-20_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.305, - "acc_stderr": 0.014566646394664404 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811488 - }, - "anli_r3": { - "acc": 0.3283333333333333, - "acc_stderr": 0.013562032919529026 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.23913043478260868 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.41495717984465247, - "acc_stderr": 0.004917076726623791, - "acc_norm": 0.536247759410476, - "acc_norm_stderr": 0.004976651989757641 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.03002557981936643 - }, - "winogrande": { - "acc": 0.5422257300710339, - "acc_stderr": 0.014002284504422438 - }, - "storycloze_2016": { - "acc": 0.669695350080171, - "acc_stderr": 0.010876149841754851 - }, - "boolq": { - "acc": 0.5697247706422018, - "acc_stderr": 0.008659608602932495 - }, - "arc_easy": { - "acc": 0.5601851851851852, - "acc_stderr": 0.010185185185185313, - "acc_norm": 0.5391414141414141, - "acc_norm_stderr": 0.010228298200766128 - }, - "arc_challenge": { - "acc": 0.24829351535836178, - "acc_stderr": 0.012624912868089755, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252425 - }, - "sciq": { - "acc": 0.845, - "acc_stderr": 0.011450157470799471, - "acc_norm": 0.821, - "acc_norm_stderr": 0.012128730605719116 - }, - "piqa": { - "acc": 0.7290533188248096, - "acc_stderr": 0.010369718937426843, - "acc_norm": 0.7268770402611534, - "acc_norm_stderr": 0.010395730264453258 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_3.csv b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..860b80170629068159a040158a2a318f1bdac532 --- /dev/null +++ b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.316,0.014709193056057121,0 +anli_r2,acc,0.319,0.01474640486547349,0 +anli_r3,acc,0.34,0.013680495725767785,0 +arc_challenge,acc,0.25170648464163825,0.012682496334042961,0 +arc_challenge,acc_norm,0.26791808873720135,0.012942030195136432,0 +arc_easy,acc,0.5643939393939394,0.010174341733665226,0 +arc_easy,acc_norm,0.5404040404040404,0.010226230740889025,0 +boolq,acc,0.5559633027522936,0.008690105214920793,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.17117117117117117,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.41884086835291773,0.004923609207861546,0 +hellaswag,acc_norm,0.5448117904799841,0.00496970108106838,0 +piqa,acc,0.7317736670293797,0.010336761992404483,0 +piqa,acc_norm,0.7295973884657236,0.010363167031620778,0 +rte,acc,0.555956678700361,0.029907396333795994,0 +sciq,acc,0.844,0.01148023500612236,0 +sciq,acc_norm,0.83,0.011884495834541656,0 +storycloze_2016,acc,0.6680919294494923,0.010889456787175313,0 +winogrande,acc,0.5493291239147593,0.013983928869040237,0 diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json deleted file mode 100644 index 07fba337c5dc0ba691ef6993905bcbe4a4848a25..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.316, - "acc_stderr": 0.014709193056057121 - }, - "anli_r2": { - "acc": 0.319, - "acc_stderr": 0.01474640486547349 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767785 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.17117117117117117 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.41884086835291773, - "acc_stderr": 0.004923609207861546, - "acc_norm": 0.5448117904799841, - "acc_norm_stderr": 0.00496970108106838 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795994 - }, - "winogrande": { - "acc": 0.5493291239147593, - "acc_stderr": 0.013983928869040237 - }, - "storycloze_2016": { - "acc": 0.6680919294494923, - "acc_stderr": 0.010889456787175313 - }, - "boolq": { - "acc": 0.5559633027522936, - "acc_stderr": 0.008690105214920793 - }, - "arc_easy": { - "acc": 0.5643939393939394, - "acc_stderr": 0.010174341733665226, - "acc_norm": 0.5404040404040404, - "acc_norm_stderr": 0.010226230740889025 - }, - "arc_challenge": { - "acc": 0.25170648464163825, - "acc_stderr": 0.012682496334042961, - "acc_norm": 0.26791808873720135, - "acc_norm_stderr": 0.012942030195136432 - }, - "sciq": { - "acc": 0.844, - "acc_stderr": 0.01148023500612236, - "acc_norm": 0.83, - "acc_norm_stderr": 0.011884495834541656 - }, - "piqa": { - "acc": 0.7317736670293797, - "acc_stderr": 0.010336761992404483, - "acc_norm": 0.7295973884657236, - "acc_norm_stderr": 0.010363167031620778 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_4.csv b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..a68b0dd5e05fd027bd0bae246c7732fbbf74c798 --- /dev/null +++ b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928367,0 +anli_r2,acc,0.329,0.014865395385928359,0 +anli_r3,acc,0.3358333333333333,0.013639261190932882,0 +arc_challenge,acc,0.25426621160409557,0.012724999945157744,0 +arc_challenge,acc_norm,0.27474402730375425,0.013044617212771227,0 +arc_easy,acc,0.5686026936026936,0.01016275284774751,0 +arc_easy,acc_norm,0.5526094276094277,0.010202832385415642,0 +boolq,acc,0.5584097859327217,0.008685178933161664,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.22212270488132557,,1 +copa,acc,0.74,0.04408440022768078,0 +hellaswag,acc,0.41655048795060745,0.004919794704673265,0 +hellaswag,acc_norm,0.5428201553475404,0.004971449552787172,0 +piqa,acc,0.7323177366702938,0.01033011118937043,0 +piqa,acc_norm,0.7317736670293797,0.010336761992404485,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.858,0.011043457699378227,0 +sciq,acc_norm,0.826,0.01199449323097343,0 +storycloze_2016,acc,0.6691608765366115,0.010880601338204659,0 +winogrande,acc,0.5193370165745856,0.014041972733712979,0 diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json deleted file mode 100644 index 3607f39a73bc87061c2d15195a1a84879f34ef04..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_4_lm-eval_global_step52452_2023-02-13-10-25-20_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928367 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928359 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.013639261190932882 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.22212270488132557 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768078 - }, - "hellaswag": { - "acc": 0.41655048795060745, - "acc_stderr": 0.004919794704673265, - "acc_norm": 0.5428201553475404, - "acc_norm_stderr": 0.004971449552787172 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5193370165745856, - "acc_stderr": 0.014041972733712979 - }, - "storycloze_2016": { - "acc": 0.6691608765366115, - "acc_stderr": 0.010880601338204659 - }, - "boolq": { - "acc": 0.5584097859327217, - "acc_stderr": 0.008685178933161664 - }, - "arc_easy": { - "acc": 0.5686026936026936, - "acc_stderr": 0.01016275284774751, - "acc_norm": 0.5526094276094277, - "acc_norm_stderr": 0.010202832385415642 - }, - "arc_challenge": { - "acc": 0.25426621160409557, - "acc_stderr": 0.012724999945157744, - "acc_norm": 0.27474402730375425, - "acc_norm_stderr": 0.013044617212771227 - }, - "sciq": { - "acc": 0.858, - "acc_stderr": 0.011043457699378227, - "acc_norm": 0.826, - "acc_norm_stderr": 0.01199449323097343 - }, - "piqa": { - "acc": 0.7323177366702938, - "acc_stderr": 0.01033011118937043, - "acc_norm": 0.7317736670293797, - "acc_norm_stderr": 0.010336761992404485 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_5.csv b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..99d3f1d9261b499f38d365b09adb4a0b68c7768b --- /dev/null +++ b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229868,0 +anli_r2,acc,0.321,0.014770821817934642,0 +anli_r3,acc,0.3441666666666667,0.013720551062295756,0 +arc_challenge,acc,0.25426621160409557,0.012724999945157744,0 +arc_challenge,acc_norm,0.2815699658703072,0.013143376735009026,0 +arc_easy,acc,0.5656565656565656,0.010170943451269421,0 +arc_easy,acc_norm,0.5572390572390572,0.010192333348394459,0 +boolq,acc,0.5660550458715596,0.008668405003744129,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.21956970232832299,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.418442541326429,0.004922953651577692,0 +hellaswag,acc_norm,0.54690300736905,0.0049677789400119285,0 +piqa,acc,0.720892274211099,0.010465657948498228,0 +piqa,acc_norm,0.7285092491838956,0.010376251176596137,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.853,0.011203415395160336,0 +sciq,acc_norm,0.85,0.011297239823409296,0 +storycloze_2016,acc,0.6675574559059326,0.010893860778343542,0 +winogrande,acc,0.5422257300710339,0.014002284504422435,0 diff --git a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json b/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json deleted file mode 100644 index 2f24fc22025ccd961957812a327ce893a94f2d66..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed2/evaluation/rankeval/2b855b4bc4seed2_5_lm-eval_global_step52452_2023-02-13-10-25-20_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229868 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.014770821817934642 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295756 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.21956970232832299 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.418442541326429, - "acc_stderr": 0.004922953651577692, - "acc_norm": 0.54690300736905, - "acc_norm_stderr": 0.0049677789400119285 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5422257300710339, - "acc_stderr": 0.014002284504422435 - }, - "storycloze_2016": { - "acc": 0.6675574559059326, - "acc_stderr": 0.010893860778343542 - }, - "boolq": { - "acc": 0.5660550458715596, - "acc_stderr": 0.008668405003744129 - }, - "arc_easy": { - "acc": 0.5656565656565656, - "acc_stderr": 0.010170943451269421, - "acc_norm": 0.5572390572390572, - "acc_norm_stderr": 0.010192333348394459 - }, - "arc_challenge": { - "acc": 0.25426621160409557, - "acc_stderr": 0.012724999945157744, - "acc_norm": 0.2815699658703072, - "acc_norm_stderr": 0.013143376735009026 - }, - "sciq": { - "acc": 0.853, - "acc_stderr": 0.011203415395160336, - "acc_norm": 0.85, - "acc_norm_stderr": 0.011297239823409296 - }, - "piqa": { - "acc": 0.720892274211099, - "acc_stderr": 0.010465657948498228, - "acc_norm": 0.7285092491838956, - "acc_norm_stderr": 0.010376251176596137 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/generation/merged.csv b/2b855b4bc4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..38bae63892fa56a38c532151c2fd6747bc2320ea --- /dev/null +++ b/2b855b4bc4seed3/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.02609365721462583 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.02609365721462583 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17691313213489698 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17691313213489698 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19767759513416522 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.19767759513416522 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19908966260019945 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19908966260019945 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.19942804280342485 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.19942804280342485 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19867021572522917 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19867021572522917 +e2e_nlg_cleaned,5,average,multiple,0.1663120509354236 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04284406630002153 +gem_xsum,0,median,rouge2_fmeasure,0.04284406630002153 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.027241156975209957 +gem_xsum,1,median,rouge2_fmeasure,0.027241156975209957 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.02759041092903475 +gem_xsum,2,median,rouge2_fmeasure,0.02759041092903475 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.028243845157073965 +gem_xsum,3,median,rouge2_fmeasure,0.028243845157073965 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.006625288687720446 +gem_xsum,4,median,rouge2_fmeasure,0.006625288687720446 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,4.083966348117292e-05 +gem_xsum,5,median,rouge2_fmeasure,4.083966348117292e-05 +gem_xsum,5,average,multiple,0.022097601285423638 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05163800526728178 +web_nlg_en,0,median,rouge2_fmeasure,0.05163800526728178 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.050306297117355465 +web_nlg_en,1,median,rouge2_fmeasure,0.050306297117355465 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.051035302851707 +web_nlg_en,2,median,rouge2_fmeasure,0.051035302851707 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05068243900070087 +web_nlg_en,3,median,rouge2_fmeasure,0.05068243900070087 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.051153679368359566 +web_nlg_en,4,median,rouge2_fmeasure,0.051153679368359566 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05265210125461001 +web_nlg_en,5,median,rouge2_fmeasure,0.05265210125461001 +web_nlg_en,5,average,multiple,0.051244637476669117 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.035905575078758745 +wiki_lingua_en,0,median,rouge2_fmeasure,0.035905575078758745 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.035621584152366104 +wiki_lingua_en,1,median,rouge2_fmeasure,0.035621584152366104 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04182590324110814 +wiki_lingua_en,2,median,rouge2_fmeasure,0.04182590324110814 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03738404170670319 +wiki_lingua_en,3,median,rouge2_fmeasure,0.03738404170670319 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012293895701182715 +wiki_lingua_en,4,median,rouge2_fmeasure,0.012293895701182715 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.001927917866254743 +wiki_lingua_en,5,median,rouge2_fmeasure,0.001927917866254743 +wiki_lingua_en,5,average,multiple,0.027493152957728938 diff --git a/2b855b4bc4seed3/evaluation/generation/merged.json b/2b855b4bc4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..2fd0b194919fb8ec196cca2a033262ba3bfd934a --- /dev/null +++ b/2b855b4bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.34149794973297676, "bleu_stderr": 0.024070843348789728, "rouge1_fmeasure": 0.1096206882907206, "rouge1_fmeasure_stderr": 0.0021032653721366847, "rouge1_precision": 0.07378966069848865, "rouge1_precision_stderr": 0.0017651819835916797, "rouge1_recall": 0.3052093327427227, "rouge1_recall_stderr": 0.004659524236835493, "rouge2_fmeasure": 0.05163800526728178, "rouge2_fmeasure_stderr": 0.0013219986026484292, "rouge2_precision": 0.034474538483852506, "rouge2_precision_stderr": 0.0010660195680270341, "rouge2_recall": 0.14548848993244087, "rouge2_recall_stderr": 0.003124645184585433, "rougeL_fmeasure": 0.10472065071810575, "rougeL_fmeasure_stderr": 0.0019332303421715103, "rougeL_precision": 0.07017746146795226, "rougeL_precision_stderr": 0.0016122484526565823, "rougeL_recall": 0.29481573900127955, "rougeL_recall_stderr": 0.004527972156547588, "rougeLsum_fmeasure": 0.10417528413829004, "rougeLsum_fmeasure_stderr": 0.0019666759386625056, "rougeLsum_precision": 0.0700995753938997, "rougeLsum_precision_stderr": 0.001661073056353709, "rougeLsum_recall": 0.28981525868277624, "rougeLsum_recall_stderr": 0.004354059423193383}}, "1": {"PALM_prompt": {"bleu": 0.41285407770135396, "bleu_stderr": 0.01754790811261284, "rouge1_fmeasure": 0.10997132734799935, "rouge1_fmeasure_stderr": 0.0019284940397050105, "rouge1_precision": 0.07104522218873542, "rouge1_precision_stderr": 0.0014806365737847472, "rouge1_recall": 0.34953794126791987, "rouge1_recall_stderr": 0.005235141680101085, "rouge2_fmeasure": 0.050306297117355465, "rouge2_fmeasure_stderr": 0.00119042729298282, "rouge2_precision": 0.03251202410097206, "rouge2_precision_stderr": 0.0009116920375298458, "rouge2_recall": 0.16398144553532898, "rouge2_recall_stderr": 0.003418128791041732, "rougeL_fmeasure": 0.10279990101949857, "rougeL_fmeasure_stderr": 0.0017245999730528924, "rougeL_precision": 0.06622972258356442, "rougeL_precision_stderr": 0.0013065290597614545, "rougeL_recall": 0.32780368593115344, "rougeL_recall_stderr": 0.004802505791385796, "rougeLsum_fmeasure": 0.10432538032364451, "rougeLsum_fmeasure_stderr": 0.0018116659522039802, "rougeLsum_precision": 0.06743066797886756, "rougeLsum_precision_stderr": 0.0013807345112292052, "rougeLsum_recall": 0.329753583736618, "rougeLsum_recall_stderr": 0.004769654541166196}}, "2": {"PALM_prompt": {"bleu": 0.4278049171298478, "bleu_stderr": 0.03479156413335414, "rouge1_fmeasure": 0.11315700001035954, "rouge1_fmeasure_stderr": 0.0019275832616273293, "rouge1_precision": 0.07263639445378779, "rouge1_precision_stderr": 0.0014353363731377956, "rouge1_recall": 0.3580445153586937, "rouge1_recall_stderr": 0.005074045684879914, "rouge2_fmeasure": 0.051035302851707, "rouge2_fmeasure_stderr": 0.001182755281479741, "rouge2_precision": 0.03259050316742188, "rouge2_precision_stderr": 0.0008444845748636126, "rouge2_recall": 0.1685765920634511, "rouge2_recall_stderr": 0.0034778310380469708, "rougeL_fmeasure": 0.10480743078361904, "rougeL_fmeasure_stderr": 0.001698665758313696, "rougeL_precision": 0.06713584196058077, "rougeL_precision_stderr": 0.001258671786156594, "rougeL_recall": 0.3332075972115363, "rougeL_recall_stderr": 0.004616557453883418, "rougeLsum_fmeasure": 0.10741033325940673, "rougeLsum_fmeasure_stderr": 0.0017953637918930417, "rougeLsum_precision": 0.06893935418358593, "rougeLsum_precision_stderr": 0.0013395205856225247, "rougeLsum_recall": 0.3398269694851392, "rougeLsum_recall_stderr": 0.004715674267986058}}, "3": {"PALM_prompt": {"bleu": 0.46941060436426824, "bleu_stderr": 0.02350508220521167, "rouge1_fmeasure": 0.11131094885031433, "rouge1_fmeasure_stderr": 0.0018552372383416945, "rouge1_precision": 0.07120441350320453, "rouge1_precision_stderr": 0.001374826448255674, "rouge1_recall": 0.3547998445826579, "rouge1_recall_stderr": 0.004935088834659092, "rouge2_fmeasure": 0.05068243900070087, "rouge2_fmeasure_stderr": 0.0011766392275573737, "rouge2_precision": 0.03230025514419665, "rouge2_precision_stderr": 0.0008374703196116828, "rouge2_recall": 0.16941835077334486, "rouge2_recall_stderr": 0.0034569464698063984, "rougeL_fmeasure": 0.10283504484602084, "rougeL_fmeasure_stderr": 0.001667702695841441, "rougeL_precision": 0.06572925931995278, "rougeL_precision_stderr": 0.001231045883113472, "rougeL_recall": 0.3279364049446863, "rougeL_recall_stderr": 0.004469640439180389, "rougeLsum_fmeasure": 0.10552850002852111, "rougeLsum_fmeasure_stderr": 0.0017453992137976785, "rougeLsum_precision": 0.06751520950054614, "rougeLsum_precision_stderr": 0.0012937415786685453, "rougeLsum_recall": 0.3348924038621722, "rougeLsum_recall_stderr": 0.004530414393472706}}, "4": {"PALM_prompt": {"bleu": 0.49231845291011167, "bleu_stderr": 0.02033666078442315, "rouge1_fmeasure": 0.11237732143262441, "rouge1_fmeasure_stderr": 0.0017932111214244764, "rouge1_precision": 0.07147573750121046, "rouge1_precision_stderr": 0.0013220739422350534, "rouge1_recall": 0.36380200151009223, "rouge1_recall_stderr": 0.005011962111617361, "rouge2_fmeasure": 0.051153679368359566, "rouge2_fmeasure_stderr": 0.0011201500694600927, "rouge2_precision": 0.03236462324247146, "rouge2_precision_stderr": 0.00079321491317793, "rouge2_recall": 0.176366711287249, "rouge2_recall_stderr": 0.0035023272922778893, "rougeL_fmeasure": 0.10388691313533473, "rougeL_fmeasure_stderr": 0.001598354278889633, "rougeL_precision": 0.06599559048573157, "rougeL_precision_stderr": 0.0011692496534413568, "rougeL_recall": 0.3365228809934532, "rougeL_recall_stderr": 0.004509950713480321, "rougeLsum_fmeasure": 0.10698650246231642, "rougeLsum_fmeasure_stderr": 0.0016815909197073864, "rougeLsum_precision": 0.06804259798371044, "rougeLsum_precision_stderr": 0.0012389830245728504, "rougeLsum_recall": 0.3454059950238982, "rougeLsum_recall_stderr": 0.004629916939784921}}, "5": {"PALM_prompt": {"bleu": 0.5202463706599937, "bleu_stderr": 0.029811069639771856, "rouge1_fmeasure": 0.1144399470934002, "rouge1_fmeasure_stderr": 0.0018450236144596445, "rouge1_precision": 0.07258813810659684, "rouge1_precision_stderr": 0.001361537139827381, "rouge1_recall": 0.36814201390251083, "rouge1_recall_stderr": 0.0049163815982217664, "rouge2_fmeasure": 0.05265210125461001, "rouge2_fmeasure_stderr": 0.001175915999194332, "rouge2_precision": 0.03329255584053826, "rouge2_precision_stderr": 0.0008388022199937085, "rouge2_recall": 0.18030364466302604, "rouge2_recall_stderr": 0.00359328286978326, "rougeL_fmeasure": 0.10579769313166237, "rougeL_fmeasure_stderr": 0.0016423622256271481, "rougeL_precision": 0.06706145771307512, "rougeL_precision_stderr": 0.0012086953689654937, "rougeL_recall": 0.34086907104199693, "rougeL_recall_stderr": 0.004424353746142977, "rougeLsum_fmeasure": 0.10846793087142063, "rougeLsum_fmeasure_stderr": 0.0017216057434839122, "rougeLsum_precision": 0.06879877428316417, "rougeLsum_precision_stderr": 0.0012706898662681679, "rougeLsum_recall": 0.3490578325381026, "rougeLsum_recall_stderr": 0.004566352334154941}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5261588796858137, "bleu_stderr": 0.051539941294178646, "rouge1_fmeasure": 0.17769437564935575, "rouge1_fmeasure_stderr": 0.0017808060621968284, "rouge1_precision": 0.15287345505067326, "rouge1_precision_stderr": 0.0018660449048136238, "rouge1_recall": 0.2568159492678247, "rouge1_recall_stderr": 0.0025435227755146754, "rouge2_fmeasure": 0.035905575078758745, "rouge2_fmeasure_stderr": 0.0008252184218331057, "rouge2_precision": 0.0307314317769995, "rouge2_precision_stderr": 0.0007399321880831138, "rouge2_recall": 0.05399339040546112, "rouge2_recall_stderr": 0.0014079643129843208, "rougeL_fmeasure": 0.14077868507206634, "rougeL_fmeasure_stderr": 0.0012904898389564119, "rougeL_precision": 0.11971263337255637, "rougeL_precision_stderr": 0.00132482642422352, "rougeL_recall": 0.20833002069504247, "rougeL_recall_stderr": 0.002117816513708951, "rougeLsum_fmeasure": 0.16361955605052478, "rougeLsum_fmeasure_stderr": 0.001629153891693766, "rougeLsum_precision": 0.14057536578447416, "rougeLsum_precision_stderr": 0.0017066864254147629, "rougeLsum_recall": 0.23737545212851918, "rougeLsum_recall_stderr": 0.0023677431728314755}}, "1": {"tldr_en": {"bleu": 1.823019960933038, "bleu_stderr": 0.05973791030825484, "rouge1_fmeasure": 0.18075325013552948, "rouge1_fmeasure_stderr": 0.0018077753887983314, "rouge1_precision": 0.1550248405642116, "rouge1_precision_stderr": 0.001896906655916291, "rouge1_recall": 0.26218064341281055, "rouge1_recall_stderr": 0.0025909975018623467, "rouge2_fmeasure": 0.035621584152366104, "rouge2_fmeasure_stderr": 0.0008398665519360502, "rouge2_precision": 0.030566268511602344, "rouge2_precision_stderr": 0.0007838403852034119, "rouge2_recall": 0.05337110142969444, "rouge2_recall_stderr": 0.0014076030736419759, "rougeL_fmeasure": 0.13291690754472102, "rougeL_fmeasure_stderr": 0.001223963436118407, "rougeL_precision": 0.11273422340111668, "rougeL_precision_stderr": 0.0012663923687857362, "rougeL_recall": 0.19777032997286517, "rougeL_recall_stderr": 0.0020307703717816245, "rougeLsum_fmeasure": 0.16958664100384446, "rougeLsum_fmeasure_stderr": 0.0016748737051982695, "rougeLsum_precision": 0.1452503262898813, "rougeLsum_precision_stderr": 0.0017575394416805713, "rougeLsum_recall": 0.2467604576066894, "rougeLsum_recall_stderr": 0.0024368623092909268}}, "2": {"tldr_en": {"bleu": 2.1643875138687165, "bleu_stderr": 0.056220230120082204, "rouge1_fmeasure": 0.19446372326563105, "rouge1_fmeasure_stderr": 0.0018300062010547177, "rouge1_precision": 0.16609426045522735, "rouge1_precision_stderr": 0.0019616065481029557, "rouge1_recall": 0.28378085558545096, "rouge1_recall_stderr": 0.002580200456036555, "rouge2_fmeasure": 0.04182590324110814, "rouge2_fmeasure_stderr": 0.000885657370806394, "rouge2_precision": 0.03562732889383582, "rouge2_precision_stderr": 0.0008114993883591799, "rouge2_recall": 0.06311157313124471, "rouge2_recall_stderr": 0.0014858352617254111, "rougeL_fmeasure": 0.14383271792331606, "rougeL_fmeasure_stderr": 0.0012464448514675746, "rougeL_precision": 0.1213496694210961, "rougeL_precision_stderr": 0.001312597801505532, "rougeL_recall": 0.21557747368665503, "rougeL_recall_stderr": 0.00206346826263494, "rougeLsum_fmeasure": 0.18247130831026273, "rougeLsum_fmeasure_stderr": 0.0017139002799806218, "rougeLsum_precision": 0.15566323067848228, "rougeLsum_precision_stderr": 0.0018351802577195, "rougeLsum_recall": 0.267085025843511, "rougeLsum_recall_stderr": 0.002463044867338434}}, "3": {"tldr_en": {"bleu": 2.300863526074219, "bleu_stderr": 0.07065742987598841, "rouge1_fmeasure": 0.16867270963398673, "rouge1_fmeasure_stderr": 0.0021319646118587686, "rouge1_precision": 0.1491556698566765, "rouge1_precision_stderr": 0.0022775659672613522, "rouge1_recall": 0.24522892927354162, "rouge1_recall_stderr": 0.0030988596130487505, "rouge2_fmeasure": 0.03738404170670319, "rouge2_fmeasure_stderr": 0.0008832725798383575, "rouge2_precision": 0.03261718042945885, "rouge2_precision_stderr": 0.0008506379612495235, "rouge2_recall": 0.05686498659646707, "rouge2_recall_stderr": 0.0015469706436568246, "rougeL_fmeasure": 0.12428675427356456, "rougeL_fmeasure_stderr": 0.0014927320729562039, "rougeL_precision": 0.10930983027179599, "rougeL_precision_stderr": 0.0016530791704382865, "rougeL_recall": 0.18566837950611784, "rougeL_recall_stderr": 0.0024476905555294998, "rougeLsum_fmeasure": 0.15794839427111632, "rougeLsum_fmeasure_stderr": 0.001982128997367088, "rougeLsum_precision": 0.13951873305134105, "rougeLsum_precision_stderr": 0.0021296141002242358, "rougeLsum_recall": 0.2306796487515666, "rougeLsum_recall_stderr": 0.0029363659057151732}}, "4": {"tldr_en": {"bleu": 0.526157980235587, "bleu_stderr": 0.035893694821752094, "rouge1_fmeasure": 0.05482228440148745, "rouge1_fmeasure_stderr": 0.0018411210777513375, "rouge1_precision": 0.04962574104721049, "rouge1_precision_stderr": 0.0018198600753817529, "rouge1_recall": 0.08309074877432944, "rouge1_recall_stderr": 0.0028287198414467443, "rouge2_fmeasure": 0.012293895701182715, "rouge2_fmeasure_stderr": 0.0006120954626321466, "rouge2_precision": 0.01090079803758114, "rouge2_precision_stderr": 0.0006444678926744452, "rouge2_recall": 0.019812592799815287, "rouge2_recall_stderr": 0.0011029993228413686, "rougeL_fmeasure": 0.041592184904605284, "rougeL_fmeasure_stderr": 0.0013709027308235946, "rougeL_precision": 0.037784268536837355, "rougeL_precision_stderr": 0.0013979790292500702, "rougeL_recall": 0.06439200423863728, "rougeL_recall_stderr": 0.0022221319438930445, "rougeLsum_fmeasure": 0.05126045406771062, "rougeLsum_fmeasure_stderr": 0.001717949102944444, "rougeLsum_precision": 0.046396981275511374, "rougeLsum_precision_stderr": 0.001704814967730543, "rougeLsum_recall": 0.07788609616568212, "rougeLsum_recall_stderr": 0.0026562895636961918}}, "5": {"tldr_en": {"bleu": 6.769985230041797e-07, "bleu_stderr": 1.2586843819060696e-06, "rouge1_fmeasure": 0.008595908139301592, "rouge1_fmeasure_stderr": 0.0008073759124959728, "rouge1_precision": 0.008084628719376088, "rouge1_precision_stderr": 0.0008292009587921892, "rouge1_recall": 0.013422996225543295, "rouge1_recall_stderr": 0.0012941094098939123, "rouge2_fmeasure": 0.001927917866254743, "rouge2_fmeasure_stderr": 0.0002442725839215325, "rouge2_precision": 0.0016974555955458473, "rouge2_precision_stderr": 0.00023857282475562208, "rouge2_recall": 0.0033717060617100485, "rouge2_recall_stderr": 0.00048409539971276954, "rougeL_fmeasure": 0.006569130133095981, "rougeL_fmeasure_stderr": 0.0006157023515465104, "rougeL_precision": 0.006201043078048079, "rougeL_precision_stderr": 0.0006437985069880845, "rougeL_recall": 0.010479327433197483, "rougeL_recall_stderr": 0.001033938989737594, "rougeLsum_fmeasure": 0.0079780486761824, "rougeLsum_fmeasure_stderr": 0.0007499108221812356, "rougeLsum_precision": 0.007523663903430591, "rougeLsum_precision_stderr": 0.0007759720126710234, "rougeLsum_recall": 0.012430043169956264, "rougeLsum_recall_stderr": 0.001195761046149762}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.3008331337613632, "bleu_stderr": 0.02372471026082312, "rouge1_fmeasure": 0.10477670169072488, "rouge1_fmeasure_stderr": 0.001668223310604396, "rouge1_precision": 0.08248993401904099, "rouge1_precision_stderr": 0.0013803853900472352, "rouge1_recall": 0.1503419085960325, "rouge1_recall_stderr": 0.002302820189602957, "rouge2_fmeasure": 0.02609365721462583, "rouge2_fmeasure_stderr": 0.0007136384892572035, "rouge2_precision": 0.020458390817454854, "rouge2_precision_stderr": 0.0005797040018566984, "rouge2_recall": 0.037680989592197074, "rouge2_recall_stderr": 0.000995916347442491, "rougeL_fmeasure": 0.09409395300259811, "rougeL_fmeasure_stderr": 0.0014417140025738482, "rougeL_precision": 0.07378599079591229, "rougeL_precision_stderr": 0.001178667026437885, "rougeL_recall": 0.13589278125850884, "rougeL_recall_stderr": 0.002031395693088984, "rougeLsum_fmeasure": 0.0894824614287922, "rougeLsum_fmeasure_stderr": 0.0014428186280295051, "rougeLsum_precision": 0.0704021978448046, "rougeLsum_precision_stderr": 0.0011947802972828237, "rougeLsum_recall": 0.12855621958661312, "rougeLsum_recall_stderr": 0.0019853343040545446}}, "1": {"generate_text_restaurant": {"bleu": 10.085633981219681, "bleu_stderr": 0.13114687703480873, "rouge1_fmeasure": 0.40492393990177894, "rouge1_fmeasure_stderr": 0.0021686130562344114, "rouge1_precision": 0.47669277500147966, "rouge1_precision_stderr": 0.003209255486352785, "rouge1_recall": 0.3946262496830458, "rouge1_recall_stderr": 0.0027404407923539735, "rouge2_fmeasure": 0.17691313213489698, "rouge2_fmeasure_stderr": 0.0017467710048168846, "rouge2_precision": 0.2120532695089864, "rouge2_precision_stderr": 0.002391983379707285, "rouge2_recall": 0.17207100564715888, "rouge2_recall_stderr": 0.0018913128485845037, "rougeL_fmeasure": 0.2927706288415245, "rougeL_fmeasure_stderr": 0.001829290609402662, "rougeL_precision": 0.34650436594696765, "rougeL_precision_stderr": 0.0027321720938178356, "rougeL_recall": 0.28534263286674877, "rougeL_recall_stderr": 0.0022102370145310915, "rougeLsum_fmeasure": 0.33082231042235805, "rougeLsum_fmeasure_stderr": 0.002083904106554716, "rougeLsum_precision": 0.38957083415976546, "rougeLsum_precision_stderr": 0.002960477614753955, "rougeLsum_recall": 0.322590649218115, "rougeLsum_recall_stderr": 0.0025042617435344547}}, "2": {"generate_text_restaurant": {"bleu": 11.48683797480448, "bleu_stderr": 0.13689537278996403, "rouge1_fmeasure": 0.4208464469170597, "rouge1_fmeasure_stderr": 0.0021092264701600334, "rouge1_precision": 0.4942511981129798, "rouge1_precision_stderr": 0.0032660128681032607, "rouge1_recall": 0.4085028215603489, "rouge1_recall_stderr": 0.0026739155224546154, "rouge2_fmeasure": 0.19767759513416522, "rouge2_fmeasure_stderr": 0.0017539198357637792, "rouge2_precision": 0.2373507527123457, "rouge2_precision_stderr": 0.0024848652599795953, "rouge2_recall": 0.19182097685969646, "rouge2_recall_stderr": 0.001950014267790833, "rougeL_fmeasure": 0.30607374477307414, "rougeL_fmeasure_stderr": 0.0018764570305197334, "rougeL_precision": 0.3603237638386147, "rougeL_precision_stderr": 0.002800467703045716, "rougeL_recall": 0.29768188381504496, "rougeL_recall_stderr": 0.0022796801680467554, "rougeLsum_fmeasure": 0.3464174958364187, "rougeLsum_fmeasure_stderr": 0.002083524201578578, "rougeLsum_precision": 0.4068099710117287, "rougeLsum_precision_stderr": 0.003032395394767631, "rougeLsum_recall": 0.33666266443429244, "rougeLsum_recall_stderr": 0.002515792046486858}}, "3": {"generate_text_restaurant": {"bleu": 11.769317262433407, "bleu_stderr": 0.17669670054609596, "rouge1_fmeasure": 0.4221147903719478, "rouge1_fmeasure_stderr": 0.0020295332203399134, "rouge1_precision": 0.4904056510064383, "rouge1_precision_stderr": 0.0031659951961414408, "rouge1_recall": 0.41293435900278025, "rouge1_recall_stderr": 0.002630040946607861, "rouge2_fmeasure": 0.19908966260019945, "rouge2_fmeasure_stderr": 0.0016882594583074867, "rouge2_precision": 0.236199561820689, "rouge2_precision_stderr": 0.002385523373980958, "rouge2_recall": 0.19469823760811275, "rouge2_recall_stderr": 0.001901571013251424, "rougeL_fmeasure": 0.30671384165835025, "rougeL_fmeasure_stderr": 0.0017921314683741266, "rougeL_precision": 0.3564493470388228, "rougeL_precision_stderr": 0.002635880628411399, "rougeL_recall": 0.3010314923474163, "rougeL_recall_stderr": 0.002250390454096838, "rougeLsum_fmeasure": 0.34923441169872316, "rougeLsum_fmeasure_stderr": 0.002005819510690739, "rougeLsum_precision": 0.40487723059858666, "rougeLsum_precision_stderr": 0.002880598466201736, "rougeLsum_recall": 0.34227554057062715, "rougeLsum_recall_stderr": 0.002473736978450292}}, "4": {"generate_text_restaurant": {"bleu": 11.79134221382112, "bleu_stderr": 0.12535342307352249, "rouge1_fmeasure": 0.42272700237747296, "rouge1_fmeasure_stderr": 0.0020515426996600143, "rouge1_precision": 0.484106197309497, "rouge1_precision_stderr": 0.0031600467188405215, "rouge1_recall": 0.41439408270388467, "rouge1_recall_stderr": 0.0025254962523307673, "rouge2_fmeasure": 0.19942804280342485, "rouge2_fmeasure_stderr": 0.001718153664439426, "rouge2_precision": 0.23280617771885598, "rouge2_precision_stderr": 0.002378760062291859, "rouge2_recall": 0.19532277089673009, "rouge2_recall_stderr": 0.001883777494857017, "rougeL_fmeasure": 0.30361143197886953, "rougeL_fmeasure_stderr": 0.0018388468569395635, "rougeL_precision": 0.3475427077445094, "rougeL_precision_stderr": 0.0026240119823243554, "rougeL_recall": 0.2988477247393447, "rougeL_recall_stderr": 0.0022290096948513667, "rougeLsum_fmeasure": 0.3486912827093588, "rougeLsum_fmeasure_stderr": 0.0020362098882303987, "rougeLsum_precision": 0.39820571388519943, "rougeLsum_precision_stderr": 0.002872696509880383, "rougeLsum_recall": 0.3429055746792053, "rougeLsum_recall_stderr": 0.0024410385054846767}}, "5": {"generate_text_restaurant": {"bleu": 11.72418690367057, "bleu_stderr": 0.18628580707282996, "rouge1_fmeasure": 0.4215479777795988, "rouge1_fmeasure_stderr": 0.0020010443799496564, "rouge1_precision": 0.4830504778255406, "rouge1_precision_stderr": 0.0030980094676226273, "rouge1_recall": 0.4124495876019635, "rouge1_recall_stderr": 0.002483193348305745, "rouge2_fmeasure": 0.19867021572522917, "rouge2_fmeasure_stderr": 0.0016815021829648367, "rouge2_precision": 0.231717878622748, "rouge2_precision_stderr": 0.002328708645973225, "rouge2_recall": 0.1940652845730282, "rouge2_recall_stderr": 0.0018164123274361779, "rougeL_fmeasure": 0.30032232907072093, "rougeL_fmeasure_stderr": 0.0018362752859886558, "rougeL_precision": 0.34461859029467656, "rougeL_precision_stderr": 0.002633989428993704, "rougeL_recall": 0.29446488277887223, "rougeL_recall_stderr": 0.0021715077178552132, "rougeLsum_fmeasure": 0.3476723734711335, "rougeLsum_fmeasure_stderr": 0.0020299970599519913, "rougeLsum_precision": 0.39765584236310936, "rougeLsum_precision_stderr": 0.0028640415949561264, "rougeLsum_recall": 0.34087939285197333, "rougeLsum_recall_stderr": 0.0024081610318172435}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.758300806812453, "bleu_stderr": 0.08007657091232745, "rouge1_fmeasure": 0.20138730742156502, "rouge1_fmeasure_stderr": 0.002383767388064678, "rouge1_precision": 0.1467728287510969, "rouge1_precision_stderr": 0.0018798387621829981, "rouge1_recall": 0.3426621610043869, "rouge1_recall_stderr": 0.004085220051925251, "rouge2_fmeasure": 0.04284406630002153, "rouge2_fmeasure_stderr": 0.0014348419140421667, "rouge2_precision": 0.030737676391623, "rouge2_precision_stderr": 0.001042998118809114, "rouge2_recall": 0.07530608814449148, "rouge2_recall_stderr": 0.002579685315939481, "rougeL_fmeasure": 0.1524394323338251, "rougeL_fmeasure_stderr": 0.0017896584767211985, "rougeL_precision": 0.11080294912069388, "rougeL_precision_stderr": 0.0013883854204739152, "rougeL_recall": 0.2613439394110765, "rougeL_recall_stderr": 0.0032334192933790623, "rougeLsum_fmeasure": 0.15852820727803194, "rougeLsum_fmeasure_stderr": 0.002016897117326563, "rougeLsum_precision": 0.11517373209464234, "rougeLsum_precision_stderr": 0.0015432535916910508, "rougeLsum_recall": 0.27156653646651985, "rougeLsum_recall_stderr": 0.003583906011404639}}, "1": {"article_DOC_summary": {"bleu": 1.068521917297439, "bleu_stderr": 0.07081554469607021, "rouge1_fmeasure": 0.161966647405913, "rouge1_fmeasure_stderr": 0.002275508088552081, "rouge1_precision": 0.1150142004518967, "rouge1_precision_stderr": 0.0016979295763785898, "rouge1_recall": 0.2853687722776007, "rouge1_recall_stderr": 0.003887322902695504, "rouge2_fmeasure": 0.027241156975209957, "rouge2_fmeasure_stderr": 0.0011850876332815702, "rouge2_precision": 0.01913886386721117, "rouge2_precision_stderr": 0.0008334256871780643, "rouge2_recall": 0.049352067674168415, "rouge2_recall_stderr": 0.0022154147154658994, "rougeL_fmeasure": 0.12635482853943877, "rougeL_fmeasure_stderr": 0.0016921288128198752, "rougeL_precision": 0.08946843700175185, "rougeL_precision_stderr": 0.0012456339897073143, "rougeL_recall": 0.2246153573495094, "rougeL_recall_stderr": 0.0030778149244190824, "rougeLsum_fmeasure": 0.12946181025819034, "rougeLsum_fmeasure_stderr": 0.0018452169275890971, "rougeLsum_precision": 0.09164639183215616, "rougeLsum_precision_stderr": 0.0013537084215489445, "rougeLsum_recall": 0.23014753910837163, "rougeLsum_recall_stderr": 0.0033378771042977774}}, "2": {"article_DOC_summary": {"bleu": 1.1404707134666303, "bleu_stderr": 0.0763074063462816, "rouge1_fmeasure": 0.16316915996732448, "rouge1_fmeasure_stderr": 0.00222219612710695, "rouge1_precision": 0.11588212536153684, "rouge1_precision_stderr": 0.0016624816842844062, "rouge1_recall": 0.28715636969862207, "rouge1_recall_stderr": 0.0037238202768370603, "rouge2_fmeasure": 0.02759041092903475, "rouge2_fmeasure_stderr": 0.001224973665979267, "rouge2_precision": 0.019488585481011924, "rouge2_precision_stderr": 0.0008699986075808974, "rouge2_recall": 0.049050060157236515, "rouge2_recall_stderr": 0.002197822228921419, "rougeL_fmeasure": 0.12776510085518877, "rougeL_fmeasure_stderr": 0.0016759148718658386, "rougeL_precision": 0.09059303186666452, "rougeL_precision_stderr": 0.0012486685404702663, "rougeL_recall": 0.22603558793942302, "rougeL_recall_stderr": 0.0028955051705399438, "rougeLsum_fmeasure": 0.13236674212783153, "rougeLsum_fmeasure_stderr": 0.0018430388687133352, "rougeLsum_precision": 0.09381300472436582, "rougeLsum_precision_stderr": 0.0013634684479284505, "rougeLsum_recall": 0.2343891889138819, "rougeLsum_recall_stderr": 0.0032048386156111825}}, "3": {"article_DOC_summary": {"bleu": 1.221044981008685, "bleu_stderr": 0.10868838485790933, "rouge1_fmeasure": 0.15807668591689616, "rouge1_fmeasure_stderr": 0.002443441595112497, "rouge1_precision": 0.11488434713804944, "rouge1_precision_stderr": 0.001955627749787486, "rouge1_recall": 0.2731232424111669, "rouge1_recall_stderr": 0.004125158821806333, "rouge2_fmeasure": 0.028243845157073965, "rouge2_fmeasure_stderr": 0.0012478840231055872, "rouge2_precision": 0.020292464913240202, "rouge2_precision_stderr": 0.0009168973933821024, "rouge2_recall": 0.05019821733774533, "rouge2_recall_stderr": 0.0022708942813293377, "rougeL_fmeasure": 0.12359221823446583, "rougeL_fmeasure_stderr": 0.0018584557529039685, "rougeL_precision": 0.08974785934027785, "rougeL_precision_stderr": 0.0014932173090276594, "rougeL_recall": 0.2148635466327155, "rougeL_recall_stderr": 0.003264459100616069, "rougeLsum_fmeasure": 0.1272082040074634, "rougeLsum_fmeasure_stderr": 0.002023846459874765, "rougeLsum_precision": 0.09230375665572382, "rougeLsum_precision_stderr": 0.0015999981344002648, "rougeLsum_recall": 0.2209621362049808, "rougeLsum_recall_stderr": 0.003503729625386469}}, "4": {"article_DOC_summary": {"bleu": 0.459228506453719, "bleu_stderr": 0.09303491692744054, "rouge1_fmeasure": 0.04351243098574765, "rouge1_fmeasure_stderr": 0.002432621516064442, "rouge1_precision": 0.036476341357403325, "rouge1_precision_stderr": 0.002211874872915692, "rouge1_recall": 0.06909099731369842, "rouge1_recall_stderr": 0.003956965010765941, "rouge2_fmeasure": 0.006625288687720446, "rouge2_fmeasure_stderr": 0.0006850711797057065, "rouge2_precision": 0.00529115024012113, "rouge2_precision_stderr": 0.0006656114996886191, "rouge2_recall": 0.01125719544920813, "rouge2_recall_stderr": 0.0011999082655028018, "rougeL_fmeasure": 0.03310688284895296, "rougeL_fmeasure_stderr": 0.001826231174044403, "rougeL_precision": 0.02806849542123108, "rougeL_precision_stderr": 0.00172463867846649, "rougeL_recall": 0.05277609289610784, "rougeL_recall_stderr": 0.00301816505171875, "rougeLsum_fmeasure": 0.03542151450287807, "rougeLsum_fmeasure_stderr": 0.001972920506883744, "rougeLsum_precision": 0.029786599311555296, "rougeLsum_precision_stderr": 0.0018070981054378132, "rougeLsum_recall": 0.05651726106694229, "rougeLsum_recall_stderr": 0.003266095361229875}}, "5": {"article_DOC_summary": {"bleu": 1.94523582636457e-40, "bleu_stderr": 1.770797257142403e-34, "rouge1_fmeasure": 0.001667584644670821, "rouge1_fmeasure_stderr": 0.0004911027706634228, "rouge1_precision": 0.001970346939188046, "rouge1_precision_stderr": 0.0006037527679422855, "rouge1_recall": 0.0015162376960444791, "rouge1_recall_stderr": 0.0004358066253493901, "rouge2_fmeasure": 4.083966348117292e-05, "rouge2_fmeasure_stderr": 4.083966348117434e-05, "rouge2_precision": 5.04489960649783e-05, "rouge2_precision_stderr": 5.044899606497865e-05, "rouge2_recall": 3.430531732418525e-05, "rouge2_recall_stderr": 3.430531732418596e-05, "rougeL_fmeasure": 0.001189737348545102, "rougeL_fmeasure_stderr": 0.00032571147073793893, "rougeL_precision": 0.0013938270230454883, "rougeL_precision_stderr": 0.0003900845162521499, "rougeL_recall": 0.0010974860194350932, "rougeL_recall_stderr": 0.000300906670923998, "rougeLsum_fmeasure": 0.0014194659154747587, "rougeLsum_fmeasure_stderr": 0.00040073671736681434, "rougeLsum_precision": 0.0016534992166799462, "rougeLsum_precision_stderr": 0.00047525493906725965, "rougeLsum_recall": 0.0013086229919921253, "rougeLsum_recall_stderr": 0.0003680353954949699}}}} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_0.csv b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..271a9c52084e4ea4f5164720a4d86d1723f6538b --- /dev/null +++ b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.347,0.01506047203170662,0 +anli_r2,acc,0.338,0.014965960710224479,0 +anli_r3,acc,0.3425,0.013704669762934732,0 +arc_challenge,acc,0.24914675767918087,0.012639407111926433,0 +arc_challenge,acc_norm,0.2619453924914676,0.012849054826858112,0 +arc_easy,acc,0.5412457912457912,0.010224815730255818,0 +arc_easy,acc_norm,0.4802188552188552,0.010251751199542723,0 +boolq,acc,0.5290519877675841,0.008730280528451538,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.31979092421002614,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.4261103365863374,0.004934995402995949,0 +hellaswag,acc_norm,0.5429197371041625,0.004971364031062592,0 +piqa,acc,0.735038084874864,0.010296557993316052,0 +piqa,acc_norm,0.733949945593036,0.01031003926335282,0 +rte,acc,0.4981949458483754,0.030096267148976626,0 +sciq,acc,0.788,0.012931481864938064,0 +sciq,acc_norm,0.71,0.014356395999905682,0 +storycloze_2016,acc,0.6803848209513629,0.010783759733730748,0 +winogrande,acc,0.5343330702446725,0.014019317531542575,0 diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json deleted file mode 100644 index 8e7bb1575db6012b8dff559a7729ae15b19ba9e7..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_0_lm-eval_global_step52452_2023-02-13-10-25-19_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.347, - "acc_stderr": 0.01506047203170662 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224479 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934732 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.31979092421002614 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.4261103365863374, - "acc_stderr": 0.004934995402995949, - "acc_norm": 0.5429197371041625, - "acc_norm_stderr": 0.004971364031062592 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.5343330702446725, - "acc_stderr": 0.014019317531542575 - }, - "storycloze_2016": { - "acc": 0.6803848209513629, - "acc_stderr": 0.010783759733730748 - }, - "boolq": { - "acc": 0.5290519877675841, - "acc_stderr": 0.008730280528451538 - }, - "arc_easy": { - "acc": 0.5412457912457912, - "acc_stderr": 0.010224815730255818, - "acc_norm": 0.4802188552188552, - "acc_norm_stderr": 0.010251751199542723 - }, - "arc_challenge": { - "acc": 0.24914675767918087, - "acc_stderr": 0.012639407111926433, - "acc_norm": 0.2619453924914676, - "acc_norm_stderr": 0.012849054826858112 - }, - "sciq": { - "acc": 0.788, - "acc_stderr": 0.012931481864938064, - "acc_norm": 0.71, - "acc_norm_stderr": 0.014356395999905682 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316052, - "acc_norm": 0.733949945593036, - "acc_norm_stderr": 0.01031003926335282 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_1.csv b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..2cba6e212101f6af9d3434acfc0db809aaefc981 --- /dev/null +++ b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.31,0.0146326386586329,0 +anli_r2,acc,0.348,0.015070604603768412,0 +anli_r3,acc,0.3566666666666667,0.013833742805050715,0 +arc_challenge,acc,0.25,0.012653835621466646,0 +arc_challenge,acc_norm,0.2815699658703072,0.013143376735009024,0 +arc_easy,acc,0.5576599326599326,0.010191334444220854,0 +arc_easy,acc_norm,0.5122053872053872,0.010256726235129018,0 +boolq,acc,0.5440366972477064,0.008711071588226805,1 +cb,acc,0.5535714285714286,0.06703189227942395,1 +cb,f1,0.37472283813747226,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.4219279028082055,0.004928578106026371,0 +hellaswag,acc_norm,0.544214299940251,0.004970234032728298,0 +piqa,acc,0.7290533188248096,0.010369718937426843,0 +piqa,acc_norm,0.7274211099020674,0.010389256803296004,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.851,0.011266140684632176,0 +sciq,acc_norm,0.809,0.012436787112179477,0 +storycloze_2016,acc,0.6809192944949225,0.010778970635312492,0 +winogrande,acc,0.5303867403314917,0.014026510839428739,0 diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json deleted file mode 100644 index b4e362d0cc7d353ab363ee26af37b42218b29251..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_1_lm-eval_global_step52452_2023-02-13-10-25-20_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.31, - "acc_stderr": 0.0146326386586329 - }, - "anli_r2": { - "acc": 0.348, - "acc_stderr": 0.015070604603768412 - }, - "anli_r3": { - "acc": 0.3566666666666667, - "acc_stderr": 0.013833742805050715 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.37472283813747226 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.4219279028082055, - "acc_stderr": 0.004928578106026371, - "acc_norm": 0.544214299940251, - "acc_norm_stderr": 0.004970234032728298 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5303867403314917, - "acc_stderr": 0.014026510839428739 - }, - "storycloze_2016": { - "acc": 0.6809192944949225, - "acc_stderr": 0.010778970635312492 - }, - "boolq": { - "acc": 0.5440366972477064, - "acc_stderr": 0.008711071588226805 - }, - "arc_easy": { - "acc": 0.5576599326599326, - "acc_stderr": 0.010191334444220854, - "acc_norm": 0.5122053872053872, - "acc_norm_stderr": 0.010256726235129018 - }, - "arc_challenge": { - "acc": 0.25, - "acc_stderr": 0.012653835621466646, - "acc_norm": 0.2815699658703072, - "acc_norm_stderr": 0.013143376735009024 - }, - "sciq": { - "acc": 0.851, - "acc_stderr": 0.011266140684632176, - "acc_norm": 0.809, - "acc_norm_stderr": 0.012436787112179477 - }, - "piqa": { - "acc": 0.7290533188248096, - "acc_stderr": 0.010369718937426843, - "acc_norm": 0.7274211099020674, - "acc_norm_stderr": 0.010389256803296004 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_2.csv b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..2fce33218c9a92df8ec87c741b8d9cae42b61865 --- /dev/null +++ b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928357,0 +anli_r2,acc,0.338,0.014965960710224487,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.2832764505119454,0.013167478735134575,0 +arc_easy,acc,0.5723905723905723,0.010151683397430675,0 +arc_easy,acc_norm,0.5404040404040404,0.010226230740889027,0 +boolq,acc,0.5382262996941896,0.008719460098106853,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3080848777867311,,1 +copa,acc,0.67,0.04725815626252609,0 +hellaswag,acc,0.4217287392949612,0.0049282634946167326,0 +hellaswag,acc_norm,0.5426209918342959,0.004971619995879752,0 +piqa,acc,0.7312295973884657,0.01034339294009001,0 +piqa,acc_norm,0.7257889009793254,0.010408618664933388,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.851,0.011266140684632178,0 +sciq,acc_norm,0.817,0.012233587399477825,0 +storycloze_2016,acc,0.6718332442544094,0.010858184920580577,0 +winogrande,acc,0.526440410418311,0.014032823874407225,0 diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json deleted file mode 100644 index 8a8567cf443571f633e75466cfb1e4c420656ab4..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_2_lm-eval_global_step52452_2023-02-13-10-25-19_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928357 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224487 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077247 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3080848777867311 - }, - "copa": { - "acc": 0.67, - "acc_stderr": 0.04725815626252609 - }, - "hellaswag": { - "acc": 0.4217287392949612, - "acc_stderr": 0.0049282634946167326, - "acc_norm": 0.5426209918342959, - "acc_norm_stderr": 0.004971619995879752 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.526440410418311, - "acc_stderr": 0.014032823874407225 - }, - "storycloze_2016": { - "acc": 0.6718332442544094, - "acc_stderr": 0.010858184920580577 - }, - "boolq": { - "acc": 0.5382262996941896, - "acc_stderr": 0.008719460098106853 - }, - "arc_easy": { - "acc": 0.5723905723905723, - "acc_stderr": 0.010151683397430675, - "acc_norm": 0.5404040404040404, - "acc_norm_stderr": 0.010226230740889027 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.2832764505119454, - "acc_norm_stderr": 0.013167478735134575 - }, - "sciq": { - "acc": 0.851, - "acc_stderr": 0.011266140684632178, - "acc_norm": 0.817, - "acc_norm_stderr": 0.012233587399477825 - }, - "piqa": { - "acc": 0.7312295973884657, - "acc_stderr": 0.01034339294009001, - "acc_norm": 0.7257889009793254, - "acc_norm_stderr": 0.010408618664933388 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_3.csv b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..f779c1621ecf0e7306b21b11fc2ade3de8140fea --- /dev/null +++ b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.321,0.014770821817934647,0 +anli_r2,acc,0.352,0.015110404505648666,0 +anli_r3,acc,0.3475,0.013751753243291854,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858115,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0 +arc_easy,acc,0.5614478114478114,0.01018201027547112,0 +arc_easy,acc_norm,0.5370370370370371,0.010231597249131058,0 +boolq,acc,0.5434250764525994,0.008712010793695305,1 +cb,acc,0.5714285714285714,0.06672848092813058,1 +cb,f1,0.3561111111111112,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.42113124875522806,0.0049273147294335564,0 +hellaswag,acc_norm,0.5461063533160725,0.0049685216080654635,0 +piqa,acc,0.7312295973884657,0.01034339294009001,0 +piqa,acc_norm,0.7257889009793254,0.010408618664933384,0 +rte,acc,0.5234657039711191,0.030063300411902652,0 +sciq,acc,0.855,0.011139977517890129,0 +sciq,acc_norm,0.823,0.012075463420375061,0 +storycloze_2016,acc,0.6793158738642437,0.01079328909592361,0 +winogrande,acc,0.516179952644041,0.014045126130978603,0 diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json deleted file mode 100644 index bad30a4dc2924693f7d1477f8d740352385169cf..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_3_lm-eval_global_step52452_2023-02-13-10-25-20_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.321, - "acc_stderr": 0.014770821817934647 - }, - "anli_r2": { - "acc": 0.352, - "acc_stderr": 0.015110404505648666 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291854 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.3561111111111112 - }, - "copa": { - "acc": 0.7, - "acc_stderr": 0.046056618647183814 - }, - "hellaswag": { - "acc": 0.42113124875522806, - "acc_stderr": 0.0049273147294335564, - "acc_norm": 0.5461063533160725, - "acc_norm_stderr": 0.0049685216080654635 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.516179952644041, - "acc_stderr": 0.014045126130978603 - }, - "storycloze_2016": { - "acc": 0.6793158738642437, - "acc_stderr": 0.01079328909592361 - }, - "boolq": { - "acc": 0.5434250764525994, - "acc_stderr": 0.008712010793695305 - }, - "arc_easy": { - "acc": 0.5614478114478114, - "acc_stderr": 0.01018201027547112, - "acc_norm": 0.5370370370370371, - "acc_norm_stderr": 0.010231597249131058 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858115, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097222 - }, - "sciq": { - "acc": 0.855, - "acc_stderr": 0.011139977517890129, - "acc_norm": 0.823, - "acc_norm_stderr": 0.012075463420375061 - }, - "piqa": { - "acc": 0.7312295973884657, - "acc_stderr": 0.01034339294009001, - "acc_norm": 0.7257889009793254, - "acc_norm_stderr": 0.010408618664933384 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_4.csv b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..015eb5a093ac7064f5e0582e6fd4fe8a1a17189f --- /dev/null +++ b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.357,0.01515852172148677,0 +anli_r3,acc,0.355,0.013819249004047298,0 +arc_challenge,acc,0.2508532423208191,0.01266819862131543,0 +arc_challenge,acc_norm,0.2858361774744027,0.013203196088537369,0 +arc_easy,acc,0.5622895622895623,0.010179856486006908,0 +arc_easy,acc_norm,0.5517676767676768,0.01020464512685694,0 +boolq,acc,0.5434250764525994,0.008712010793695303,1 +cb,acc,0.5892857142857143,0.06633634150359538,1 +cb,f1,0.3670995670995671,,1 +copa,acc,0.68,0.04688261722621504,0 +hellaswag,acc,0.4190400318661621,0.004923935749842496,0 +hellaswag,acc_norm,0.5459071898028282,0.004968705270086758,0 +piqa,acc,0.7306855277475517,0.010350004070588758,0 +piqa,acc_norm,0.7252448313384113,0.01041503367667607,0 +rte,acc,0.49458483754512633,0.030094698123239966,0 +sciq,acc,0.847,0.011389500459665542,0 +sciq,acc_norm,0.825,0.012021627157731965,0 +storycloze_2016,acc,0.6771779796900054,0.010812153082758841,0 +winogrande,acc,0.5217048145224941,0.014039239216484633,0 diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json deleted file mode 100644 index c90ca87739abed6795368110c0cbd191233580b3..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_4_lm-eval_global_step52452_2023-02-13-10-25-19_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.357, - "acc_stderr": 0.01515852172148677 - }, - "anli_r3": { - "acc": 0.355, - "acc_stderr": 0.013819249004047298 - }, - "cb": { - "acc": 0.5892857142857143, - "acc_stderr": 0.06633634150359538, - "f1": 0.3670995670995671 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621504 - }, - "hellaswag": { - "acc": 0.4190400318661621, - "acc_stderr": 0.004923935749842496, - "acc_norm": 0.5459071898028282, - "acc_norm_stderr": 0.004968705270086758 - }, - "rte": { - "acc": 0.49458483754512633, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5217048145224941, - "acc_stderr": 0.014039239216484633 - }, - "storycloze_2016": { - "acc": 0.6771779796900054, - "acc_stderr": 0.010812153082758841 - }, - "boolq": { - "acc": 0.5434250764525994, - "acc_stderr": 0.008712010793695303 - }, - "arc_easy": { - "acc": 0.5622895622895623, - "acc_stderr": 0.010179856486006908, - "acc_norm": 0.5517676767676768, - "acc_norm_stderr": 0.01020464512685694 - }, - "arc_challenge": { - "acc": 0.2508532423208191, - "acc_stderr": 0.01266819862131543, - "acc_norm": 0.2858361774744027, - "acc_norm_stderr": 0.013203196088537369 - }, - "sciq": { - "acc": 0.847, - "acc_stderr": 0.011389500459665542, - "acc_norm": 0.825, - "acc_norm_stderr": 0.012021627157731965 - }, - "piqa": { - "acc": 0.7306855277475517, - "acc_stderr": 0.010350004070588758, - "acc_norm": 0.7252448313384113, - "acc_norm_stderr": 0.01041503367667607 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_5.csv b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..c7357ba4d9bea6a18e144fa900bbf946d49b9a68 --- /dev/null +++ b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.362,0.015204840912919498,0 +anli_r2,acc,0.351,0.015100563798316405,0 +anli_r3,acc,0.3466666666666667,0.013744022550571947,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858112,0 +arc_challenge,acc_norm,0.2909556313993174,0.013273077865907585,0 +arc_easy,acc,0.5622895622895623,0.010179856486006908,0 +arc_easy,acc_norm,0.5433501683501684,0.010221149650118186,0 +boolq,acc,0.5483180428134556,0.008704126206159355,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.29927667269439423,,1 +copa,acc,0.74,0.04408440022768079,0 +hellaswag,acc,0.41884086835291773,0.004923609207861548,0 +hellaswag,acc_norm,0.5483967337183828,0.004966351835028204,0 +piqa,acc,0.7268770402611534,0.010395730264453267,0 +piqa,acc_norm,0.7274211099020674,0.010389256803296004,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.855,0.011139977517890132,0 +sciq,acc_norm,0.846,0.0114199130650987,0 +storycloze_2016,acc,0.6809192944949225,0.010778970635312498,0 +winogrande,acc,0.5185477505919495,0.014042813708888378,0 diff --git a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json b/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json deleted file mode 100644 index 651698070e3a039220761b7c1c97b2803569cecd..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed3/evaluation/rankeval/2b855b4bc4seed3_5_lm-eval_global_step52452_2023-02-13-10-25-19_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.362, - "acc_stderr": 0.015204840912919498 - }, - "anli_r2": { - "acc": 0.351, - "acc_stderr": 0.015100563798316405 - }, - "anli_r3": { - "acc": 0.3466666666666667, - "acc_stderr": 0.013744022550571947 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.29927667269439423 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768079 - }, - "hellaswag": { - "acc": 0.41884086835291773, - "acc_stderr": 0.004923609207861548, - "acc_norm": 0.5483967337183828, - "acc_norm_stderr": 0.004966351835028204 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5185477505919495, - "acc_stderr": 0.014042813708888378 - }, - "storycloze_2016": { - "acc": 0.6809192944949225, - "acc_stderr": 0.010778970635312498 - }, - "boolq": { - "acc": 0.5483180428134556, - "acc_stderr": 0.008704126206159355 - }, - "arc_easy": { - "acc": 0.5622895622895623, - "acc_stderr": 0.010179856486006908, - "acc_norm": 0.5433501683501684, - "acc_norm_stderr": 0.010221149650118186 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858112, - "acc_norm": 0.2909556313993174, - "acc_norm_stderr": 0.013273077865907585 - }, - "sciq": { - "acc": 0.855, - "acc_stderr": 0.011139977517890132, - "acc_norm": 0.846, - "acc_norm_stderr": 0.0114199130650987 - }, - "piqa": { - "acc": 0.7268770402611534, - "acc_stderr": 0.010395730264453267, - "acc_norm": 0.7274211099020674, - "acc_norm_stderr": 0.010389256803296004 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/generation/merged.csv b/2b855b4bc4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..89f22f5648c4eb92494ec09640a6badbbc3e66c6 --- /dev/null +++ b/2b855b4bc4seed4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.04373600830424405 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.04373600830424405 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1292295827254539 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1292295827254539 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1525672254265503 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1525672254265503 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.16433215272682788 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.16433215272682788 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.17075806903946023 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.17075806903946023 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.17839351489974706 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17839351489974706 +e2e_nlg_cleaned,5,average,multiple,0.13983609218704723 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04349769069606019 +gem_xsum,0,median,rouge2_fmeasure,0.04349769069606019 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.028906541802651994 +gem_xsum,1,median,rouge2_fmeasure,0.028906541802651994 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.026854897047199597 +gem_xsum,2,median,rouge2_fmeasure,0.026854897047199597 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.02488030276229101 +gem_xsum,3,median,rouge2_fmeasure,0.02488030276229101 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.006274952888784966 +gem_xsum,4,median,rouge2_fmeasure,0.006274952888784966 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00031517045471688554 +gem_xsum,5,median,rouge2_fmeasure,0.00031517045471688554 +gem_xsum,5,average,multiple,0.021788259275284105 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.050128803450783736 +web_nlg_en,0,median,rouge2_fmeasure,0.050128803450783736 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.049466402717971425 +web_nlg_en,1,median,rouge2_fmeasure,0.049466402717971425 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.050548594865136715 +web_nlg_en,2,median,rouge2_fmeasure,0.050548594865136715 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.049455987234530444 +web_nlg_en,3,median,rouge2_fmeasure,0.049455987234530444 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05155366582980554 +web_nlg_en,4,median,rouge2_fmeasure,0.05155366582980554 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.051786927845675346 +web_nlg_en,5,median,rouge2_fmeasure,0.051786927845675346 +web_nlg_en,5,average,multiple,0.0504900636573172 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03373913758548128 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03373913758548128 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03531089483964126 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03531089483964126 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03996859475236519 +wiki_lingua_en,2,median,rouge2_fmeasure,0.03996859475236519 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03285990193579709 +wiki_lingua_en,3,median,rouge2_fmeasure,0.03285990193579709 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.011107909854854566 +wiki_lingua_en,4,median,rouge2_fmeasure,0.011107909854854566 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0017959526330645192 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0017959526330645192 +wiki_lingua_en,5,average,multiple,0.025797065266867317 diff --git a/2b855b4bc4seed4/evaluation/generation/merged.json b/2b855b4bc4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..17b564e5b15c743c34b6c0793522c40f115a74ad --- /dev/null +++ b/2b855b4bc4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.34324276920868313, "bleu_stderr": 0.04042884516669532, "rouge1_fmeasure": 0.10692073812276545, "rouge1_fmeasure_stderr": 0.002228211567230255, "rouge1_precision": 0.07296789069612485, "rouge1_precision_stderr": 0.0019521199064113212, "rouge1_recall": 0.291055417618012, "rouge1_recall_stderr": 0.004795229126860099, "rouge2_fmeasure": 0.050128803450783736, "rouge2_fmeasure_stderr": 0.0013852515541962897, "rouge2_precision": 0.033880944183910085, "rouge2_precision_stderr": 0.001157798532806098, "rouge2_recall": 0.1391368711508087, "rouge2_recall_stderr": 0.00321540122032897, "rougeL_fmeasure": 0.10133416610225789, "rougeL_fmeasure_stderr": 0.0020259994822571046, "rougeL_precision": 0.06856142162768449, "rougeL_precision_stderr": 0.0017142287836628277, "rougeL_recall": 0.27977592769537585, "rougeL_recall_stderr": 0.004631211297577514, "rougeLsum_fmeasure": 0.10073859159017663, "rougeLsum_fmeasure_stderr": 0.0020639190617154223, "rougeLsum_precision": 0.06867374798150541, "rougeLsum_precision_stderr": 0.0017972345258701681, "rougeLsum_recall": 0.2746713285482631, "rougeLsum_recall_stderr": 0.0044596496901865}}, "1": {"PALM_prompt": {"bleu": 0.3972496417987282, "bleu_stderr": 0.036010019939447795, "rouge1_fmeasure": 0.11055693009097887, "rouge1_fmeasure_stderr": 0.0020733351759109737, "rouge1_precision": 0.07173716490166453, "rouge1_precision_stderr": 0.0016575699247925147, "rouge1_recall": 0.34974653283485535, "rouge1_recall_stderr": 0.005075045161820977, "rouge2_fmeasure": 0.049466402717971425, "rouge2_fmeasure_stderr": 0.0012737918893369865, "rouge2_precision": 0.03197497485855603, "rouge2_precision_stderr": 0.000962460798718954, "rouge2_recall": 0.16023628528666056, "rouge2_recall_stderr": 0.0033864329911046445, "rougeL_fmeasure": 0.10154030260144617, "rougeL_fmeasure_stderr": 0.0017877836104798683, "rougeL_precision": 0.06570458314483123, "rougeL_precision_stderr": 0.0014418681663620739, "rougeL_recall": 0.3233748614144031, "rougeL_recall_stderr": 0.004572558510992172, "rougeLsum_fmeasure": 0.10435973514066761, "rougeLsum_fmeasure_stderr": 0.0019205631477026942, "rougeLsum_precision": 0.06768578595454278, "rougeLsum_precision_stderr": 0.0015421117795754444, "rougeLsum_recall": 0.33004228573250355, "rougeLsum_recall_stderr": 0.004669294869942142}}, "2": {"PALM_prompt": {"bleu": 0.42251969127538314, "bleu_stderr": 0.02840731592399811, "rouge1_fmeasure": 0.11333703477812178, "rouge1_fmeasure_stderr": 0.0019145559318279055, "rouge1_precision": 0.07268811468700499, "rouge1_precision_stderr": 0.0014465201349766466, "rouge1_recall": 0.3620024921684062, "rouge1_recall_stderr": 0.004758151532103667, "rouge2_fmeasure": 0.050548594865136715, "rouge2_fmeasure_stderr": 0.0011854559175601424, "rouge2_precision": 0.0322283688367945, "rouge2_precision_stderr": 0.0008502482679835013, "rouge2_recall": 0.16846757613232086, "rouge2_recall_stderr": 0.0033455697709206113, "rougeL_fmeasure": 0.10419580129010868, "rougeL_fmeasure_stderr": 0.0017010035365024065, "rougeL_precision": 0.06667638034366834, "rougeL_precision_stderr": 0.001268912493823364, "rougeL_recall": 0.33245074977257466, "rougeL_recall_stderr": 0.004293122764050932, "rougeLsum_fmeasure": 0.10724340509994847, "rougeLsum_fmeasure_stderr": 0.0017949889583002074, "rougeLsum_precision": 0.0687634532617638, "rougeLsum_precision_stderr": 0.0013543210018833482, "rougeLsum_recall": 0.342503278280948, "rougeLsum_recall_stderr": 0.004455587539035164}}, "3": {"PALM_prompt": {"bleu": 0.44217653122192047, "bleu_stderr": 0.031593661762159365, "rouge1_fmeasure": 0.11295642160003114, "rouge1_fmeasure_stderr": 0.0018412364165310284, "rouge1_precision": 0.0722620425101873, "rouge1_precision_stderr": 0.0014410472737193202, "rouge1_recall": 0.37180819935826426, "rouge1_recall_stderr": 0.004981069985574128, "rouge2_fmeasure": 0.049455987234530444, "rouge2_fmeasure_stderr": 0.001156460006281891, "rouge2_precision": 0.03124446888024084, "rouge2_precision_stderr": 0.0008098682458917474, "rouge2_recall": 0.1725335855684002, "rouge2_recall_stderr": 0.00358627677859403, "rougeL_fmeasure": 0.10316046337053776, "rougeL_fmeasure_stderr": 0.0016436820049262717, "rougeL_precision": 0.06590153402573928, "rougeL_precision_stderr": 0.0012971519212857086, "rougeL_recall": 0.3403962032921045, "rougeL_recall_stderr": 0.004488926489007162, "rougeLsum_fmeasure": 0.10633344725295313, "rougeLsum_fmeasure_stderr": 0.0017238854661554687, "rougeLsum_precision": 0.06804426975589642, "rougeLsum_precision_stderr": 0.0013611694736382926, "rougeLsum_recall": 0.35029114634711506, "rougeLsum_recall_stderr": 0.004655557324214912}}, "4": {"PALM_prompt": {"bleu": 0.48314498329102984, "bleu_stderr": 0.03787399283643099, "rouge1_fmeasure": 0.11616961405282918, "rouge1_fmeasure_stderr": 0.0018477446407489302, "rouge1_precision": 0.07367578522040237, "rouge1_precision_stderr": 0.001357741317192707, "rouge1_recall": 0.38526513393710116, "rouge1_recall_stderr": 0.0050046463849835994, "rouge2_fmeasure": 0.05155366582980554, "rouge2_fmeasure_stderr": 0.0011476664579999575, "rouge2_precision": 0.03254509062308702, "rouge2_precision_stderr": 0.0008124510427002572, "rouge2_recall": 0.18245368397193032, "rouge2_recall_stderr": 0.003621082243508599, "rougeL_fmeasure": 0.10503664377918179, "rougeL_fmeasure_stderr": 0.0016291191467623064, "rougeL_precision": 0.06652893246542035, "rougeL_precision_stderr": 0.0011865627422465272, "rougeL_recall": 0.34825535701930654, "rougeL_recall_stderr": 0.004432288074688195, "rougeLsum_fmeasure": 0.10891425804634698, "rougeLsum_fmeasure_stderr": 0.0017274736858056286, "rougeLsum_precision": 0.06910909992324601, "rougeLsum_precision_stderr": 0.0012711711038927995, "rougeLsum_recall": 0.3606860119104523, "rougeLsum_recall_stderr": 0.004607464365924947}}, "5": {"PALM_prompt": {"bleu": 0.5214927210017932, "bleu_stderr": 0.0362676558726496, "rouge1_fmeasure": 0.11586936549888076, "rouge1_fmeasure_stderr": 0.0018181557055337912, "rouge1_precision": 0.07331972567362008, "rouge1_precision_stderr": 0.0013290524236173411, "rouge1_recall": 0.3872380704555046, "rouge1_recall_stderr": 0.004945081288637636, "rouge2_fmeasure": 0.051786927845675346, "rouge2_fmeasure_stderr": 0.0011483121139103775, "rouge2_precision": 0.03259185849665382, "rouge2_precision_stderr": 0.0008071851554128241, "rouge2_recall": 0.18547086709031613, "rouge2_recall_stderr": 0.0036318259271542956, "rougeL_fmeasure": 0.10495145980578367, "rougeL_fmeasure_stderr": 0.0016193255205357157, "rougeL_precision": 0.06637750097675432, "rougeL_precision_stderr": 0.0011749267430377773, "rougeL_recall": 0.3506157495992537, "rougeL_recall_stderr": 0.004394772482409434, "rougeLsum_fmeasure": 0.1083533741013584, "rougeLsum_fmeasure_stderr": 0.0016892766658439435, "rougeLsum_precision": 0.06860815773385463, "rougeLsum_precision_stderr": 0.0012378320564745045, "rougeLsum_recall": 0.36194862147523427, "rougeLsum_recall_stderr": 0.00454162088691516}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4403549482069316, "bleu_stderr": 0.045150474547351716, "rouge1_fmeasure": 0.1734265842576362, "rouge1_fmeasure_stderr": 0.0018073734851827036, "rouge1_precision": 0.14896588926048365, "rouge1_precision_stderr": 0.001865117428965681, "rouge1_recall": 0.25114303955792244, "rouge1_recall_stderr": 0.0025976582244150812, "rouge2_fmeasure": 0.03373913758548128, "rouge2_fmeasure_stderr": 0.000820013621771073, "rouge2_precision": 0.028636993661579183, "rouge2_precision_stderr": 0.0007209381730040376, "rouge2_recall": 0.051264670014572335, "rouge2_recall_stderr": 0.0013788410352771731, "rougeL_fmeasure": 0.1362820231078758, "rougeL_fmeasure_stderr": 0.0012958852142597713, "rougeL_precision": 0.11575540213255162, "rougeL_precision_stderr": 0.001317609265683095, "rougeL_recall": 0.2023312645906629, "rougeL_recall_stderr": 0.0021282744490921358, "rougeLsum_fmeasure": 0.1605617081026496, "rougeLsum_fmeasure_stderr": 0.0016680863093293033, "rougeLsum_precision": 0.13774039990991838, "rougeLsum_precision_stderr": 0.0017176432202830143, "rougeLsum_recall": 0.2332759037319789, "rougeLsum_recall_stderr": 0.0024279209798112344}}, "1": {"tldr_en": {"bleu": 1.6689742211557324, "bleu_stderr": 0.07509434935896003, "rouge1_fmeasure": 0.17951280413847304, "rouge1_fmeasure_stderr": 0.0018417106160351696, "rouge1_precision": 0.15474506869496663, "rouge1_precision_stderr": 0.001924827942957213, "rouge1_recall": 0.2591243180864731, "rouge1_recall_stderr": 0.002671683005528829, "rouge2_fmeasure": 0.03531089483964126, "rouge2_fmeasure_stderr": 0.0008481970314752401, "rouge2_precision": 0.030277949913923696, "rouge2_precision_stderr": 0.0007642687050729942, "rouge2_recall": 0.05306919673481177, "rouge2_recall_stderr": 0.0014537388291512529, "rougeL_fmeasure": 0.13458420195877946, "rougeL_fmeasure_stderr": 0.001288387880327804, "rougeL_precision": 0.11492338397756324, "rougeL_precision_stderr": 0.0013355284948114493, "rougeL_recall": 0.19848762091396968, "rougeL_recall_stderr": 0.002090392684586368, "rougeLsum_fmeasure": 0.16799034896901963, "rougeLsum_fmeasure_stderr": 0.0017118989168442727, "rougeLsum_precision": 0.14465005867943093, "rougeLsum_precision_stderr": 0.001791187616110895, "rougeLsum_recall": 0.24326279426260988, "rougeLsum_recall_stderr": 0.0025168918522164223}}, "2": {"tldr_en": {"bleu": 1.9059345319820256, "bleu_stderr": 0.0862212869590945, "rouge1_fmeasure": 0.1889648535524678, "rouge1_fmeasure_stderr": 0.0019000993723643923, "rouge1_precision": 0.16432279281163434, "rouge1_precision_stderr": 0.002057134637006594, "rouge1_recall": 0.27061170607405366, "rouge1_recall_stderr": 0.0026474390658532856, "rouge2_fmeasure": 0.03996859475236519, "rouge2_fmeasure_stderr": 0.0009087208697000767, "rouge2_precision": 0.034948319334269146, "rouge2_precision_stderr": 0.0008626543029884114, "rouge2_recall": 0.058454120347305606, "rouge2_recall_stderr": 0.0014545321339646503, "rougeL_fmeasure": 0.1421915303578599, "rougeL_fmeasure_stderr": 0.0013323621746892285, "rougeL_precision": 0.12271277257937263, "rougeL_precision_stderr": 0.0014654131371235144, "rougeL_recall": 0.20810537107894553, "rougeL_recall_stderr": 0.0020911069991223358, "rougeLsum_fmeasure": 0.17635545319070747, "rougeLsum_fmeasure_stderr": 0.0017634570224832283, "rougeLsum_precision": 0.15307857680843284, "rougeLsum_precision_stderr": 0.0019064413413230463, "rougeLsum_recall": 0.25336888708282673, "rougeLsum_recall_stderr": 0.002489536643702081}}, "3": {"tldr_en": {"bleu": 1.895715703661013, "bleu_stderr": 0.07552964842924252, "rouge1_fmeasure": 0.15929322661166584, "rouge1_fmeasure_stderr": 0.002114487389427868, "rouge1_precision": 0.14325397224977207, "rouge1_precision_stderr": 0.002254999799100571, "rouge1_recall": 0.22707822401184713, "rouge1_recall_stderr": 0.003071111337726668, "rouge2_fmeasure": 0.03285990193579709, "rouge2_fmeasure_stderr": 0.0008660466252033046, "rouge2_precision": 0.029330638084378496, "rouge2_precision_stderr": 0.0008952307905709193, "rouge2_recall": 0.0492354490310444, "rouge2_recall_stderr": 0.001505610200406455, "rougeL_fmeasure": 0.12044846330588237, "rougeL_fmeasure_stderr": 0.0015231451799974103, "rougeL_precision": 0.10796029825540078, "rougeL_precision_stderr": 0.001666436339379942, "rougeL_recall": 0.17524602854806604, "rougeL_recall_stderr": 0.0024244775259192877, "rougeLsum_fmeasure": 0.14833001620991554, "rougeLsum_fmeasure_stderr": 0.0019615869920657646, "rougeLsum_precision": 0.13319101578859208, "rougeLsum_precision_stderr": 0.0020937794109719032, "rougeLsum_recall": 0.21233577931531242, "rougeLsum_recall_stderr": 0.0028869186285355375}}, "4": {"tldr_en": {"bleu": 0.43070391501801636, "bleu_stderr": 0.03941510687574561, "rouge1_fmeasure": 0.0529041977369453, "rouge1_fmeasure_stderr": 0.0017979295348687615, "rouge1_precision": 0.04971546533920332, "rouge1_precision_stderr": 0.0018758824122665782, "rouge1_recall": 0.07786948156585018, "rouge1_recall_stderr": 0.0026939403926776653, "rouge2_fmeasure": 0.011107909854854566, "rouge2_fmeasure_stderr": 0.0005804785867448148, "rouge2_precision": 0.009778261921657906, "rouge2_precision_stderr": 0.0005392033613436508, "rouge2_recall": 0.017671831995115884, "rouge2_recall_stderr": 0.0010642012989834832, "rougeL_fmeasure": 0.04115816659548844, "rougeL_fmeasure_stderr": 0.001377641479175028, "rougeL_precision": 0.03857956276540815, "rougeL_precision_stderr": 0.001456781195110897, "rougeL_recall": 0.06208123032893519, "rougeL_recall_stderr": 0.0021854142416682905, "rougeLsum_fmeasure": 0.04937190425358539, "rougeLsum_fmeasure_stderr": 0.0016784941938438474, "rougeLsum_precision": 0.04645064246765507, "rougeLsum_precision_stderr": 0.0017647304000012298, "rougeLsum_recall": 0.07283306584696218, "rougeLsum_recall_stderr": 0.0025343186809208625}}, "5": {"tldr_en": {"bleu": 1.5560113178772187e-06, "bleu_stderr": 2.689410668139988e-06, "rouge1_fmeasure": 0.008350920891435024, "rouge1_fmeasure_stderr": 0.0007902623971411311, "rouge1_precision": 0.008251810241003887, "rouge1_precision_stderr": 0.0008580222675520164, "rouge1_recall": 0.012336458255069087, "rouge1_recall_stderr": 0.0011789335289530474, "rouge2_fmeasure": 0.0017959526330645192, "rouge2_fmeasure_stderr": 0.00023834532490981777, "rouge2_precision": 0.001674827113683945, "rouge2_precision_stderr": 0.00024120143278132748, "rouge2_recall": 0.0028090268802289384, "rouge2_recall_stderr": 0.00040392973961407477, "rougeL_fmeasure": 0.006692639786762086, "rougeL_fmeasure_stderr": 0.0006219348369924248, "rougeL_precision": 0.006630384281401625, "rougeL_precision_stderr": 0.0006917639321061648, "rougeL_recall": 0.010118137130244316, "rougeL_recall_stderr": 0.0009752530966177267, "rougeLsum_fmeasure": 0.007835592897123708, "rougeLsum_fmeasure_stderr": 0.0007371934697695601, "rougeLsum_precision": 0.007728876610024227, "rougeLsum_precision_stderr": 0.000806415842071974, "rougeLsum_recall": 0.011719048133466851, "rougeLsum_recall_stderr": 0.0011256754814151107}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.6204704591838253, "bleu_stderr": 0.08864023928045604, "rouge1_fmeasure": 0.1497906259526338, "rouge1_fmeasure_stderr": 0.0016678306030798364, "rouge1_precision": 0.15125717601218172, "rouge1_precision_stderr": 0.0023558372444341617, "rouge1_recall": 0.20286933383721498, "rouge1_recall_stderr": 0.002435388865042906, "rouge2_fmeasure": 0.04373600830424405, "rouge2_fmeasure_stderr": 0.0009238633508127928, "rouge2_precision": 0.03776084392417247, "rouge2_precision_stderr": 0.0009387904035681707, "rouge2_recall": 0.06400855310119014, "rouge2_recall_stderr": 0.001385804018692614, "rougeL_fmeasure": 0.14434954038720266, "rougeL_fmeasure_stderr": 0.001602021034919324, "rougeL_precision": 0.14292365984852842, "rougeL_precision_stderr": 0.002129366603542978, "rougeL_recall": 0.19760689598243447, "rougeL_recall_stderr": 0.0024062244000259483, "rougeLsum_fmeasure": 0.12453060937076275, "rougeLsum_fmeasure_stderr": 0.0014551898993596674, "rougeLsum_precision": 0.1282876016831206, "rougeLsum_precision_stderr": 0.002164270239475782, "rougeLsum_recall": 0.16771657474098328, "rougeLsum_recall_stderr": 0.0020933084585483446}}, "1": {"generate_text_restaurant": {"bleu": 5.731069056126733, "bleu_stderr": 0.07513594348890652, "rouge1_fmeasure": 0.3181784610674861, "rouge1_fmeasure_stderr": 0.0019010397535698363, "rouge1_precision": 0.27476475366724734, "rouge1_precision_stderr": 0.0023947689274399914, "rouge1_recall": 0.44062994751259227, "rouge1_recall_stderr": 0.0027390523040172077, "rouge2_fmeasure": 0.1292295827254539, "rouge2_fmeasure_stderr": 0.0013837738893434852, "rouge2_precision": 0.11146835655476603, "rouge2_precision_stderr": 0.0014999278031438694, "rouge2_recall": 0.18293268594405332, "rouge2_recall_stderr": 0.002049679412148304, "rougeL_fmeasure": 0.25193663716210585, "rougeL_fmeasure_stderr": 0.0014328406221047658, "rougeL_precision": 0.2157501468818633, "rougeL_precision_stderr": 0.001798437071333258, "rougeL_recall": 0.3535031663462432, "rougeL_recall_stderr": 0.002333205954911591, "rougeLsum_fmeasure": 0.26199259864168617, "rougeLsum_fmeasure_stderr": 0.0017772587238443873, "rougeLsum_precision": 0.2263969595414202, "rougeLsum_precision_stderr": 0.0021285903592069546, "rougeLsum_recall": 0.3631932128047894, "rougeLsum_recall_stderr": 0.0025874384650419425}}, "2": {"generate_text_restaurant": {"bleu": 7.09064356868689, "bleu_stderr": 0.0945683397762938, "rouge1_fmeasure": 0.34222380460893276, "rouge1_fmeasure_stderr": 0.0018813838663570479, "rouge1_precision": 0.29988600177383756, "rouge1_precision_stderr": 0.0024869664022117975, "rouge1_recall": 0.4590316038918725, "rouge1_recall_stderr": 0.0026255768145353924, "rouge2_fmeasure": 0.1525672254265503, "rouge2_fmeasure_stderr": 0.001478449851822228, "rouge2_precision": 0.13434297471676152, "rouge2_precision_stderr": 0.0017079553383250112, "rouge2_recall": 0.2078120981315888, "rouge2_recall_stderr": 0.0020792119693382356, "rougeL_fmeasure": 0.27605647746743284, "rougeL_fmeasure_stderr": 0.001493889362627944, "rougeL_precision": 0.24043130894984865, "rougeL_precision_stderr": 0.001960313070261648, "rougeL_recall": 0.3745438446520874, "rougeL_recall_stderr": 0.00235304414966667, "rougeLsum_fmeasure": 0.2878772385155684, "rougeLsum_fmeasure_stderr": 0.0018340557534748533, "rougeLsum_precision": 0.25244399776562937, "rougeLsum_precision_stderr": 0.0022920302649868077, "rougeLsum_recall": 0.386733837024959, "rougeLsum_recall_stderr": 0.0025797034203399434}}, "3": {"generate_text_restaurant": {"bleu": 7.957506717896374, "bleu_stderr": 0.11871158835592002, "rouge1_fmeasure": 0.3632065994066894, "rouge1_fmeasure_stderr": 0.0019288429364865594, "rouge1_precision": 0.3269599876552489, "rouge1_precision_stderr": 0.002484167058394505, "rouge1_recall": 0.46133517897096044, "rouge1_recall_stderr": 0.0026152317295696646, "rouge2_fmeasure": 0.16433215272682788, "rouge2_fmeasure_stderr": 0.0015761670674788884, "rouge2_precision": 0.14834444047631257, "rouge2_precision_stderr": 0.001760625724352804, "rouge2_recall": 0.21173692399268554, "rouge2_recall_stderr": 0.00211904981302154, "rougeL_fmeasure": 0.28614023387972254, "rougeL_fmeasure_stderr": 0.0015637821822143893, "rougeL_precision": 0.25598001773115986, "rougeL_precision_stderr": 0.0019516643326024168, "rougeL_recall": 0.36738424006280895, "rougeL_recall_stderr": 0.0023261416341824356, "rougeLsum_fmeasure": 0.30917462192271916, "rougeLsum_fmeasure_stderr": 0.0018974167020131198, "rougeLsum_precision": 0.2782983753758179, "rougeLsum_precision_stderr": 0.002304830743605155, "rougeLsum_recall": 0.39310540843973446, "rougeLsum_recall_stderr": 0.002553221781314415}}, "4": {"generate_text_restaurant": {"bleu": 8.612420501398658, "bleu_stderr": 0.10968680189440985, "rouge1_fmeasure": 0.3773351872242804, "rouge1_fmeasure_stderr": 0.0018967437984601723, "rouge1_precision": 0.3495371332976347, "rouge1_precision_stderr": 0.00239814840117431, "rouge1_recall": 0.4549367024803036, "rouge1_recall_stderr": 0.002499727517440357, "rouge2_fmeasure": 0.17075806903946023, "rouge2_fmeasure_stderr": 0.001565588347384688, "rouge2_precision": 0.15817173697944936, "rouge2_precision_stderr": 0.0016965674653651296, "rouge2_recall": 0.2084670091195078, "rouge2_recall_stderr": 0.0020200175294079474, "rougeL_fmeasure": 0.29071600168264844, "rougeL_fmeasure_stderr": 0.0015687462494299498, "rougeL_precision": 0.2678066088278802, "rougeL_precision_stderr": 0.0018862767167744801, "rougeL_recall": 0.35411349760127814, "rougeL_recall_stderr": 0.0022610910858217744, "rougeLsum_fmeasure": 0.3232789352760445, "rougeLsum_fmeasure_stderr": 0.0018986827571818375, "rougeLsum_precision": 0.29958429509294093, "rougeLsum_precision_stderr": 0.0022675998729044192, "rougeLsum_recall": 0.3900348351088319, "rougeLsum_recall_stderr": 0.0024852360626458066}}, "5": {"generate_text_restaurant": {"bleu": 9.341385615844723, "bleu_stderr": 0.19783570409866133, "rouge1_fmeasure": 0.392409617099057, "rouge1_fmeasure_stderr": 0.0019233339881067632, "rouge1_precision": 0.3760728836609097, "rouge1_precision_stderr": 0.002482092216655873, "rouge1_recall": 0.4510423912350261, "rouge1_recall_stderr": 0.002479833284719754, "rouge2_fmeasure": 0.17839351489974706, "rouge2_fmeasure_stderr": 0.0016164191401401606, "rouge2_precision": 0.17125002153484914, "rouge2_precision_stderr": 0.0018148494904918263, "rouge2_recall": 0.20712823323934393, "rouge2_recall_stderr": 0.002010199172027898, "rougeL_fmeasure": 0.2969259329413085, "rougeL_fmeasure_stderr": 0.0015929636744810157, "rougeL_precision": 0.28362836873424674, "rougeL_precision_stderr": 0.0019794994201990926, "rougeL_recall": 0.3439631514313915, "rougeL_recall_stderr": 0.0021919460733085955, "rougeLsum_fmeasure": 0.3374436486811447, "rougeLsum_fmeasure_stderr": 0.0019180734213142077, "rougeLsum_precision": 0.3235503031920785, "rougeLsum_precision_stderr": 0.0023528028769729827, "rougeLsum_recall": 0.38796807844231657, "rougeLsum_recall_stderr": 0.0024282870555117227}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.722606902374208, "bleu_stderr": 0.06972480667504957, "rouge1_fmeasure": 0.203129901503601, "rouge1_fmeasure_stderr": 0.0024350828266786312, "rouge1_precision": 0.14737490289212898, "rouge1_precision_stderr": 0.001892038140282547, "rouge1_recall": 0.3474027426073489, "rouge1_recall_stderr": 0.0042151212136227936, "rouge2_fmeasure": 0.04349769069606019, "rouge2_fmeasure_stderr": 0.0014767027543151523, "rouge2_precision": 0.031023017059226807, "rouge2_precision_stderr": 0.0010752878149897476, "rouge2_recall": 0.07706904739706329, "rouge2_recall_stderr": 0.0026512280243714175, "rougeL_fmeasure": 0.14911450811970686, "rougeL_fmeasure_stderr": 0.0018096037290596233, "rougeL_precision": 0.1080451338404057, "rougeL_precision_stderr": 0.0014047773398183471, "rougeL_recall": 0.2567864410450209, "rougeL_recall_stderr": 0.003276744810379596, "rougeLsum_fmeasure": 0.16110001364094212, "rougeLsum_fmeasure_stderr": 0.0021006419030991353, "rougeLsum_precision": 0.11661581282001253, "rougeLsum_precision_stderr": 0.0015960783338988694, "rougeLsum_recall": 0.27717098989285094, "rougeLsum_recall_stderr": 0.0037613824574153037}}, "1": {"article_DOC_summary": {"bleu": 1.1326558448230488, "bleu_stderr": 0.0567817612667065, "rouge1_fmeasure": 0.16616093625535694, "rouge1_fmeasure_stderr": 0.002336150674135059, "rouge1_precision": 0.11820549120562207, "rouge1_precision_stderr": 0.0017490099739484472, "rouge1_recall": 0.29128227844780374, "rouge1_recall_stderr": 0.00393484390915183, "rouge2_fmeasure": 0.028906541802651994, "rouge2_fmeasure_stderr": 0.001206689869447212, "rouge2_precision": 0.02041194044165818, "rouge2_precision_stderr": 0.0008571145383783419, "rouge2_recall": 0.0516297890657349, "rouge2_recall_stderr": 0.0021970085014639293, "rougeL_fmeasure": 0.12713761308338653, "rougeL_fmeasure_stderr": 0.0017099383024712963, "rougeL_precision": 0.09020747086581987, "rougeL_precision_stderr": 0.0012635738558992584, "rougeL_recall": 0.224596238032614, "rougeL_recall_stderr": 0.003020772846382357, "rougeLsum_fmeasure": 0.13302177000366747, "rougeLsum_fmeasure_stderr": 0.001884470053396348, "rougeLsum_precision": 0.09437253179460942, "rougeLsum_precision_stderr": 0.001391507358175456, "rougeLsum_recall": 0.23481919660598263, "rougeLsum_recall_stderr": 0.003290883829227082}}, "2": {"article_DOC_summary": {"bleu": 1.0684029358963874, "bleu_stderr": 0.09162899746046885, "rouge1_fmeasure": 0.15848594424984871, "rouge1_fmeasure_stderr": 0.0022135604242122533, "rouge1_precision": 0.11227424806799667, "rouge1_precision_stderr": 0.0016480803427349486, "rouge1_recall": 0.2805726617211659, "rouge1_recall_stderr": 0.0037813624418682698, "rouge2_fmeasure": 0.026854897047199597, "rouge2_fmeasure_stderr": 0.0011943467727152312, "rouge2_precision": 0.018887270267195208, "rouge2_precision_stderr": 0.0008462339575811449, "rouge2_recall": 0.04845735236134109, "rouge2_recall_stderr": 0.0021922273973845633, "rougeL_fmeasure": 0.12430324757166848, "rougeL_fmeasure_stderr": 0.0016908828757021333, "rougeL_precision": 0.08787048422227783, "rougeL_precision_stderr": 0.0012451433447639056, "rougeL_recall": 0.22149791412931427, "rougeL_recall_stderr": 0.003021326314855878, "rougeLsum_fmeasure": 0.12766929924765744, "rougeLsum_fmeasure_stderr": 0.0018108730785654469, "rougeLsum_precision": 0.09023224804469579, "rougeLsum_precision_stderr": 0.0013306467797407004, "rougeLsum_recall": 0.22757256259783343, "rougeLsum_recall_stderr": 0.0032254714330591755}}, "3": {"article_DOC_summary": {"bleu": 1.1080718554312659, "bleu_stderr": 0.09221671705091612, "rouge1_fmeasure": 0.15311788076069605, "rouge1_fmeasure_stderr": 0.002346657325343273, "rouge1_precision": 0.11127804347917077, "rouge1_precision_stderr": 0.00188714340710608, "rouge1_recall": 0.26480535984228515, "rouge1_recall_stderr": 0.0039876524090106695, "rouge2_fmeasure": 0.02488030276229101, "rouge2_fmeasure_stderr": 0.0011941066388191626, "rouge2_precision": 0.01785532513731725, "rouge2_precision_stderr": 0.0008603887167120456, "rouge2_recall": 0.0438885071002277, "rouge2_recall_stderr": 0.0021869569295178293, "rougeL_fmeasure": 0.11943733705937881, "rougeL_fmeasure_stderr": 0.0017959230151588935, "rougeL_precision": 0.08638798075326262, "rougeL_precision_stderr": 0.001399188131883859, "rougeL_recall": 0.20809822478613527, "rougeL_recall_stderr": 0.0031652415111972873, "rougeLsum_fmeasure": 0.12456177206249605, "rougeLsum_fmeasure_stderr": 0.001949287175241194, "rougeLsum_precision": 0.09026426387234315, "rougeLsum_precision_stderr": 0.0015399891185623484, "rougeLsum_recall": 0.21655673202787679, "rougeLsum_recall_stderr": 0.0033891917161482476}}, "4": {"article_DOC_summary": {"bleu": 0.4355961718244034, "bleu_stderr": 0.10730902991243396, "rouge1_fmeasure": 0.04171409928206798, "rouge1_fmeasure_stderr": 0.002334746940072641, "rouge1_precision": 0.035582744924747466, "rouge1_precision_stderr": 0.002301780755869687, "rouge1_recall": 0.06601172171912116, "rouge1_recall_stderr": 0.0037975944892349933, "rouge2_fmeasure": 0.006274952888784966, "rouge2_fmeasure_stderr": 0.0006779712700297606, "rouge2_precision": 0.005097963265238305, "rouge2_precision_stderr": 0.0006709367956067867, "rouge2_recall": 0.01069068417529109, "rouge2_recall_stderr": 0.001206105024150595, "rougeL_fmeasure": 0.032466458652340914, "rougeL_fmeasure_stderr": 0.0017968588947872915, "rougeL_precision": 0.02838549235315442, "rougeL_precision_stderr": 0.001962677079059737, "rougeL_recall": 0.051429066058009745, "rougeL_recall_stderr": 0.002947657362326224, "rougeLsum_fmeasure": 0.034472575444136976, "rougeLsum_fmeasure_stderr": 0.0019296546570168342, "rougeLsum_precision": 0.02986408746144512, "rougeLsum_precision_stderr": 0.0020282070313633665, "rougeLsum_recall": 0.05478363428659324, "rougeLsum_recall_stderr": 0.003189231420973017}}, "5": {"article_DOC_summary": {"bleu": 5.202446746860472e-38, "bleu_stderr": 4.1140753358502506e-32, "rouge1_fmeasure": 0.0026034059646626788, "rouge1_fmeasure_stderr": 0.0007154978720001342, "rouge1_precision": 0.0029332145274227166, "rouge1_precision_stderr": 0.0008238877526645406, "rouge1_recall": 0.002411317388173472, "rouge1_recall_stderr": 0.0006543398747660343, "rouge2_fmeasure": 0.00031517045471688554, "rouge2_fmeasure_stderr": 0.0001418048032039371, "rouge2_precision": 0.00034753752844762835, "rouge2_precision_stderr": 0.0001492365813245593, "rouge2_recall": 0.0002959933148612394, "rouge2_recall_stderr": 0.00014045555523684472, "rougeL_fmeasure": 0.0018322053184873374, "rougeL_fmeasure_stderr": 0.00048318900425969316, "rougeL_precision": 0.002026109863162261, "rougeL_precision_stderr": 0.000540547077297625, "rougeL_recall": 0.001730474631608596, "rougeL_recall_stderr": 0.00045668964174527647, "rougeLsum_fmeasure": 0.002109764742993743, "rougeLsum_fmeasure_stderr": 0.0005666450650017948, "rougeLsum_precision": 0.002363026550333052, "rougeLsum_precision_stderr": 0.0006499884400186724, "rougeLsum_recall": 0.001969996796101712, "rougeLsum_recall_stderr": 0.0005237129218904748}}}} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_0.csv b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..9f041842e3e5d97be5b80dfdfc691313774366d0 --- /dev/null +++ b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932572,0 +anli_r2,acc,0.332,0.014899597242811482,0 +anli_r3,acc,0.33666666666666667,0.013647602942406398,0 +arc_challenge,acc,0.24061433447098976,0.012491468532390571,0 +arc_challenge,acc_norm,0.26109215017064846,0.012835523909473848,0 +arc_easy,acc,0.5526094276094277,0.010202832385415646,0 +arc_easy,acc_norm,0.484006734006734,0.010254533589288174,0 +boolq,acc,0.6009174311926605,0.008565077958836787,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.3479245283018868,,1 +copa,acc,0.72,0.045126085985421276,0 +hellaswag,acc,0.4231228838876718,0.004930448527146667,0 +hellaswag,acc_norm,0.5445130452101175,0.004969968458256173,0 +piqa,acc,0.7317736670293797,0.010336761992404485,0 +piqa,acc_norm,0.7323177366702938,0.01033011118937042,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.777,0.01316983084342567,0 +sciq,acc_norm,0.692,0.014606483127342761,0 +storycloze_2016,acc,0.677712453233565,0.01080746137499636,0 +winogrande,acc,0.5272296764009471,0.014031631629827694,0 diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json deleted file mode 100644 index 62a84acb2a1b7a1c9ca32123b0cb1c931f0bff89..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_0_lm-eval_global_step52452_2023-02-15-00-33-59_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932572 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811482 - }, - "anli_r3": { - "acc": 0.33666666666666667, - "acc_stderr": 0.013647602942406398 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.3479245283018868 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.045126085985421276 - }, - "hellaswag": { - "acc": 0.4231228838876718, - "acc_stderr": 0.004930448527146667, - "acc_norm": 0.5445130452101175, - "acc_norm_stderr": 0.004969968458256173 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5272296764009471, - "acc_stderr": 0.014031631629827694 - }, - "storycloze_2016": { - "acc": 0.677712453233565, - "acc_stderr": 0.01080746137499636 - }, - "boolq": { - "acc": 0.6009174311926605, - "acc_stderr": 0.008565077958836787 - }, - "arc_easy": { - "acc": 0.5526094276094277, - "acc_stderr": 0.010202832385415646, - "acc_norm": 0.484006734006734, - "acc_norm_stderr": 0.010254533589288174 - }, - "arc_challenge": { - "acc": 0.24061433447098976, - "acc_stderr": 0.012491468532390571, - "acc_norm": 0.26109215017064846, - "acc_norm_stderr": 0.012835523909473848 - }, - "sciq": { - "acc": 0.777, - "acc_stderr": 0.01316983084342567, - "acc_norm": 0.692, - "acc_norm_stderr": 0.014606483127342761 - }, - "piqa": { - "acc": 0.7317736670293797, - "acc_stderr": 0.010336761992404485, - "acc_norm": 0.7323177366702938, - "acc_norm_stderr": 0.01033011118937042 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_1.csv b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..4dbbda5393548b7b584ea0506c94ec71c03d57d7 --- /dev/null +++ b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121731,0 +anli_r2,acc,0.311,0.014645596385722694,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.2363481228668942,0.012414960524301836,0 +arc_challenge,acc_norm,0.2619453924914676,0.012849054826858112,0 +arc_easy,acc,0.5521885521885522,0.010203742451111525,0 +arc_easy,acc_norm,0.5218855218855218,0.010249950427234164,0 +boolq,acc,0.5963302752293578,0.008581220435616821,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.37566137566137564,,1 +copa,acc,0.69,0.04648231987117316,0 +hellaswag,acc,0.42123083051185023,0.004927473370720142,0 +hellaswag,acc_norm,0.5375423222465644,0.004975696076240853,0 +piqa,acc,0.7170837867247007,0.01050894917748968,0 +piqa,acc_norm,0.7187159956474428,0.010490509832327423,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.811,0.012386784588117712,0 +sciq,acc_norm,0.781,0.013084731950262019,0 +storycloze_2016,acc,0.6707642971672902,0.010867199207548986,0 +winogrande,acc,0.5343330702446725,0.01401931753154257,0 diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json deleted file mode 100644 index 3dc4d0b5d7665d6c9cfe179312c1ea3078e3810f..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_1_lm-eval_global_step52452_2023-02-15-00-33-59_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.015008706182121731 - }, - "anli_r2": { - "acc": 0.311, - "acc_stderr": 0.014645596385722694 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.37566137566137564 - }, - "copa": { - "acc": 0.69, - "acc_stderr": 0.04648231987117316 - }, - "hellaswag": { - "acc": 0.42123083051185023, - "acc_stderr": 0.004927473370720142, - "acc_norm": 0.5375423222465644, - "acc_norm_stderr": 0.004975696076240853 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5343330702446725, - "acc_stderr": 0.01401931753154257 - }, - "storycloze_2016": { - "acc": 0.6707642971672902, - "acc_stderr": 0.010867199207548986 - }, - "boolq": { - "acc": 0.5963302752293578, - "acc_stderr": 0.008581220435616821 - }, - "arc_easy": { - "acc": 0.5521885521885522, - "acc_stderr": 0.010203742451111525, - "acc_norm": 0.5218855218855218, - "acc_norm_stderr": 0.010249950427234164 - }, - "arc_challenge": { - "acc": 0.2363481228668942, - "acc_stderr": 0.012414960524301836, - "acc_norm": 0.2619453924914676, - "acc_norm_stderr": 0.012849054826858112 - }, - "sciq": { - "acc": 0.811, - "acc_stderr": 0.012386784588117712, - "acc_norm": 0.781, - "acc_norm_stderr": 0.013084731950262019 - }, - "piqa": { - "acc": 0.7170837867247007, - "acc_stderr": 0.01050894917748968, - "acc_norm": 0.7187159956474428, - "acc_norm_stderr": 0.010490509832327423 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_2.csv b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..105720cbb87fb4d9c4429e8bab56e559ce9b2a12 --- /dev/null +++ b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932573,0 +anli_r2,acc,0.356,0.015149042659306628,0 +anli_r3,acc,0.35,0.013774667009018552,0 +arc_challenge,acc,0.25,0.012653835621466646,0 +arc_challenge,acc_norm,0.27474402730375425,0.013044617212771227,0 +arc_easy,acc,0.5618686868686869,0.010180937100600074,0 +arc_easy,acc_norm,0.5437710437710438,0.010220394383722022,0 +boolq,acc,0.6018348623853211,0.008561755594317447,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.32997198879551815,,1 +copa,acc,0.67,0.04725815626252609,0 +hellaswag,acc,0.4206333399721171,0.004926518439372271,0 +hellaswag,acc_norm,0.5421230830511851,0.004972042602001377,0 +piqa,acc,0.7241566920565833,0.010427805502729115,0 +piqa,acc_norm,0.7263329706202394,0.010402184206229223,0 +rte,acc,0.5306859205776173,0.030039730592197816,0 +sciq,acc,0.834,0.011772110370812185,0 +sciq,acc_norm,0.805,0.012535235623319327,0 +storycloze_2016,acc,0.6712987707108499,0.010862700030538157,0 +winogrande,acc,0.5556432517758485,0.013965196769083555,0 diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json deleted file mode 100644 index a5830099adc9ffdd16564149909fd6728b23edf1..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_2_lm-eval_global_step52452_2023-02-15-00-33-59_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r2": { - "acc": 0.356, - "acc_stderr": 0.015149042659306628 - }, - "anli_r3": { - "acc": 0.35, - "acc_stderr": 0.013774667009018552 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.32997198879551815 - }, - "copa": { - "acc": 0.67, - "acc_stderr": 0.04725815626252609 - }, - "hellaswag": { - "acc": 0.4206333399721171, - "acc_stderr": 0.004926518439372271, - "acc_norm": 0.5421230830511851, - "acc_norm_stderr": 0.004972042602001377 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197816 - }, - "winogrande": { - "acc": 0.5556432517758485, - "acc_stderr": 0.013965196769083555 - }, - "storycloze_2016": { - "acc": 0.6712987707108499, - "acc_stderr": 0.010862700030538157 - }, - "boolq": { - "acc": 0.6018348623853211, - "acc_stderr": 0.008561755594317447 - }, - "arc_easy": { - "acc": 0.5618686868686869, - "acc_stderr": 0.010180937100600074, - "acc_norm": 0.5437710437710438, - "acc_norm_stderr": 0.010220394383722022 - }, - "arc_challenge": { - "acc": 0.25, - "acc_stderr": 0.012653835621466646, - "acc_norm": 0.27474402730375425, - "acc_norm_stderr": 0.013044617212771227 - }, - "sciq": { - "acc": 0.834, - "acc_stderr": 0.011772110370812185, - "acc_norm": 0.805, - "acc_norm_stderr": 0.012535235623319327 - }, - "piqa": { - "acc": 0.7241566920565833, - "acc_stderr": 0.010427805502729115, - "acc_norm": 0.7263329706202394, - "acc_norm_stderr": 0.010402184206229223 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_3.csv b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..aade2c91288d27f603bcdca39222bec9e599cb5e --- /dev/null +++ b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.323,0.014794927843348635,0 +anli_r2,acc,0.341,0.014998131348402706,0 +anli_r3,acc,0.3416666666666667,0.013696658778002519,0 +arc_challenge,acc,0.24573378839590443,0.012581033453730114,0 +arc_challenge,acc_norm,0.27559726962457337,0.013057169655761838,0 +arc_easy,acc,0.5513468013468014,0.010205540414612873,0 +arc_easy,acc_norm,0.5471380471380471,0.010214087372211392,0 +boolq,acc,0.5969418960244648,0.008579113210566454,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.3545905059989567,,1 +copa,acc,0.71,0.045604802157206845,0 +hellaswag,acc,0.4208325034853615,0.004926837572202166,0 +hellaswag,acc_norm,0.5448117904799841,0.004969701081068381,0 +piqa,acc,0.7230685527747551,0.010440499969334525,0 +piqa,acc_norm,0.7274211099020674,0.010389256803296007,0 +rte,acc,0.5631768953068592,0.029855247390314952,0 +sciq,acc,0.837,0.011686212712746842,0 +sciq,acc_norm,0.811,0.012386784588117707,0 +storycloze_2016,acc,0.6718332442544094,0.01085818492058058,0 +winogrande,acc,0.5303867403314917,0.014026510839428739,0 diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_3_lm-eval_global_step52452_2023-02-15-00-34-00_3shots_backup.json b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_3_lm-eval_global_step52452_2023-02-15-00-34-00_3shots_backup.json deleted file mode 100644 index 904e33f24120417a5bf9ae71ce2c0d5a34e15802..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_3_lm-eval_global_step52452_2023-02-15-00-34-00_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.014794927843348635 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.014998131348402706 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002519 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.3545905059989567 - }, - "copa": { - "acc": 0.71, - "acc_stderr": 0.045604802157206845 - }, - "hellaswag": { - "acc": 0.4208325034853615, - "acc_stderr": 0.004926837572202166, - "acc_norm": 0.5448117904799841, - "acc_norm_stderr": 0.004969701081068381 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.029855247390314952 - }, - "winogrande": { - "acc": 0.5303867403314917, - "acc_stderr": 0.014026510839428739 - }, - "storycloze_2016": { - "acc": 0.6718332442544094, - "acc_stderr": 0.01085818492058058 - }, - "boolq": { - "acc": 0.5969418960244648, - "acc_stderr": 0.008579113210566454 - }, - "arc_easy": { - "acc": 0.5513468013468014, - "acc_stderr": 0.010205540414612873, - "acc_norm": 0.5471380471380471, - "acc_norm_stderr": 0.010214087372211392 - }, - "arc_challenge": { - "acc": 0.24573378839590443, - "acc_stderr": 0.012581033453730114, - "acc_norm": 0.27559726962457337, - "acc_norm_stderr": 0.013057169655761838 - }, - "sciq": { - "acc": 0.837, - "acc_stderr": 0.011686212712746842, - "acc_norm": 0.811, - "acc_norm_stderr": 0.012386784588117707 - }, - "piqa": { - "acc": 0.7230685527747551, - "acc_stderr": 0.010440499969334525, - "acc_norm": 0.7274211099020674, - "acc_norm_stderr": 0.010389256803296007 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_4.csv b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..9f99a172d732d89f5da46d8df792d240b3885235 --- /dev/null +++ b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928373,0 +anli_r2,acc,0.35,0.015090650341444233,0 +anli_r3,acc,0.3525,0.01379716491891836,0 +arc_challenge,acc,0.24914675767918087,0.012639407111926439,0 +arc_challenge,acc_norm,0.26706484641638223,0.012928933196496354,0 +arc_easy,acc,0.5652356902356902,0.010172083670402779,0 +arc_easy,acc_norm,0.5601851851851852,0.010185185185185323,0 +boolq,acc,0.5941896024464832,0.008588486726385772,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.2751141552511415,,1 +copa,acc,0.72,0.04512608598542128,0 +hellaswag,acc,0.42113124875522806,0.004927314729433558,0 +hellaswag,acc_norm,0.5439155546703844,0.004970497804772304,0 +piqa,acc,0.7274211099020674,0.010389256803296018,0 +piqa,acc_norm,0.7274211099020674,0.01038925680329601,0 +rte,acc,0.5595667870036101,0.029882123363118723,0 +sciq,acc,0.839,0.011628164696727183,0 +sciq,acc_norm,0.823,0.012075463420375061,0 +storycloze_2016,acc,0.6718332442544094,0.010858184920580577,0 +winogrande,acc,0.5295974743488555,0.014027843827840088,0 diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json deleted file mode 100644 index 1519c23e7b7734425f54854fcabbb219de58fff6..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_4_lm-eval_global_step52452_2023-02-15-00-33-59_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928373 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444233 - }, - "anli_r3": { - "acc": 0.3525, - "acc_stderr": 0.01379716491891836 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.2751141552511415 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542128 - }, - "hellaswag": { - "acc": 0.42113124875522806, - "acc_stderr": 0.004927314729433558, - "acc_norm": 0.5439155546703844, - "acc_norm_stderr": 0.004970497804772304 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118723 - }, - "winogrande": { - "acc": 0.5295974743488555, - "acc_stderr": 0.014027843827840088 - }, - "storycloze_2016": { - "acc": 0.6718332442544094, - "acc_stderr": 0.010858184920580577 - }, - "boolq": { - "acc": 0.5941896024464832, - "acc_stderr": 0.008588486726385772 - }, - "arc_easy": { - "acc": 0.5652356902356902, - "acc_stderr": 0.010172083670402779, - "acc_norm": 0.5601851851851852, - "acc_norm_stderr": 0.010185185185185323 - }, - "arc_challenge": { - "acc": 0.24914675767918087, - "acc_stderr": 0.012639407111926439, - "acc_norm": 0.26706484641638223, - "acc_norm_stderr": 0.012928933196496354 - }, - "sciq": { - "acc": 0.839, - "acc_stderr": 0.011628164696727183, - "acc_norm": 0.823, - "acc_norm_stderr": 0.012075463420375061 - }, - "piqa": { - "acc": 0.7274211099020674, - "acc_stderr": 0.010389256803296018, - "acc_norm": 0.7274211099020674, - "acc_norm_stderr": 0.01038925680329601 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_5.csv b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..d6fc17b0adc18ac578fb5633f2d3419686880a21 --- /dev/null +++ b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.01494414023379502,0 +anli_r2,acc,0.365,0.015231776226264909,0 +anli_r3,acc,0.3625,0.013883037874225514,0 +arc_challenge,acc,0.24658703071672355,0.012595726268790122,0 +arc_challenge,acc_norm,0.27986348122866894,0.013119040897725922,0 +arc_easy,acc,0.571969696969697,0.01015294331642627,0 +arc_easy,acc_norm,0.5517676767676768,0.010204645126856943,0 +boolq,acc,0.6055045871559633,0.008548152025770934,1 +cb,acc,0.5714285714285714,0.06672848092813058,1 +cb,f1,0.3697246436972464,,1 +copa,acc,0.72,0.04512608598542128,0 +hellaswag,acc,0.4228241386178052,0.004929983692795062,0 +hellaswag,acc_norm,0.5452101175064729,0.004969341773423513,0 +piqa,acc,0.719260065288357,0.010484325438311827,0 +piqa,acc_norm,0.7290533188248096,0.010369718937426843,0 +rte,acc,0.592057761732852,0.02958195251960619,0 +sciq,acc,0.844,0.011480235006122365,0 +sciq,acc_norm,0.827,0.011967214137559941,0 +storycloze_2016,acc,0.669695350080171,0.010876149841754857,0 +winogrande,acc,0.5351223362273086,0.014017773120881587,0 diff --git a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_5_lm-eval_global_step52452_2023-02-15-00-37-36_5shots_backup.json b/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_5_lm-eval_global_step52452_2023-02-15-00-37-36_5shots_backup.json deleted file mode 100644 index 0105aaf90e00939662677fdb7751e3fcee1ae32e..0000000000000000000000000000000000000000 --- a/2b855b4bc4seed4/evaluation/rankeval/2b855b4bc4seed4_5_lm-eval_global_step52452_2023-02-15-00-37-36_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.01494414023379502 - }, - "anli_r2": { - "acc": 0.365, - "acc_stderr": 0.015231776226264909 - }, - "anli_r3": { - "acc": 0.3625, - "acc_stderr": 0.013883037874225514 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.3697246436972464 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542128 - }, - "hellaswag": { - "acc": 0.4228241386178052, - "acc_stderr": 0.004929983692795062, - "acc_norm": 0.5452101175064729, - "acc_norm_stderr": 0.004969341773423513 - }, - "rte": { - "acc": 0.592057761732852, - "acc_stderr": 0.02958195251960619 - }, - "winogrande": { - "acc": 0.5351223362273086, - "acc_stderr": 0.014017773120881587 - }, - "storycloze_2016": { - "acc": 0.669695350080171, - "acc_stderr": 0.010876149841754857 - }, - "boolq": { - "acc": 0.6055045871559633, - "acc_stderr": 0.008548152025770934 - }, - "arc_easy": { - "acc": 0.571969696969697, - "acc_stderr": 0.01015294331642627, - "acc_norm": 0.5517676767676768, - "acc_norm_stderr": 0.010204645126856943 - }, - "arc_challenge": { - "acc": 0.24658703071672355, - "acc_stderr": 0.012595726268790122, - "acc_norm": 0.27986348122866894, - "acc_norm_stderr": 0.013119040897725922 - }, - "sciq": { - "acc": 0.844, - "acc_stderr": 0.011480235006122365, - "acc_norm": 0.827, - "acc_norm_stderr": 0.011967214137559941 - }, - "piqa": { - "acc": 0.719260065288357, - "acc_stderr": 0.010484325438311827, - "acc_norm": 0.7290533188248096, - "acc_norm_stderr": 0.010369718937426843 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/generation/merged.csv b/2b855b55bc4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..f6cf83543e865cdc225fc5e2fb3163225529aa0b --- /dev/null +++ b/2b855b55bc4seed4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00012949433318118156 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00012949433318118156 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1881691820625141 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1881691820625141 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.210504118152605 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.210504118152605 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.21907640178489668 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.21907640178489668 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.22169417479547066 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.22169417479547066 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2241886926042848 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2241886926042848 +e2e_nlg_cleaned,5,average,multiple,0.1772936772888254 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04410086560503002 +gem_xsum,0,median,rouge2_fmeasure,0.04410086560503002 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.033094435873653585 +gem_xsum,1,median,rouge2_fmeasure,0.033094435873653585 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.031586014301670295 +gem_xsum,2,median,rouge2_fmeasure,0.031586014301670295 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.02888706920552339 +gem_xsum,3,median,rouge2_fmeasure,0.02888706920552339 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.008866921103262896 +gem_xsum,4,median,rouge2_fmeasure,0.008866921103262896 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00037358459253719785 +gem_xsum,5,median,rouge2_fmeasure,0.00037358459253719785 +gem_xsum,5,average,multiple,0.024484815113612897 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04840686427764873 +web_nlg_en,0,median,rouge2_fmeasure,0.04840686427764873 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05102058176659721 +web_nlg_en,1,median,rouge2_fmeasure,0.05102058176659721 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.053915206348715154 +web_nlg_en,2,median,rouge2_fmeasure,0.053915206348715154 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05405358629973064 +web_nlg_en,3,median,rouge2_fmeasure,0.05405358629973064 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.055391697796430064 +web_nlg_en,4,median,rouge2_fmeasure,0.055391697796430064 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05681301581270846 +web_nlg_en,5,median,rouge2_fmeasure,0.05681301581270846 +web_nlg_en,5,average,multiple,0.053266825383638375 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03619473403994938 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03619473403994938 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04469035185534077 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04469035185534077 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04983969411542771 +wiki_lingua_en,2,median,rouge2_fmeasure,0.04983969411542771 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0418018159186987 +wiki_lingua_en,3,median,rouge2_fmeasure,0.0418018159186987 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01374737635952299 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01374737635952299 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024634396748570817 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0024634396748570817 +wiki_lingua_en,5,average,multiple,0.03145623532729944 diff --git a/2b855b55bc4seed4/evaluation/generation/merged.json b/2b855b55bc4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..b0bb3698f0a46bee227eadb2c7386a8770cc80ac --- /dev/null +++ b/2b855b55bc4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3920085265216284, "bleu_stderr": 0.04492187114217818, "rouge1_fmeasure": 0.10189371501598563, "rouge1_fmeasure_stderr": 0.0020328857870440037, "rouge1_precision": 0.06857889458300297, "rouge1_precision_stderr": 0.0017562607043546889, "rouge1_recall": 0.2817281440443021, "rouge1_recall_stderr": 0.004593300918613426, "rouge2_fmeasure": 0.04840686427764873, "rouge2_fmeasure_stderr": 0.0012841779874209675, "rouge2_precision": 0.03213381685145334, "rouge2_precision_stderr": 0.0010249880172205844, "rouge2_recall": 0.13631315293035448, "rouge2_recall_stderr": 0.0031278804865176217, "rougeL_fmeasure": 0.09916315175982283, "rougeL_fmeasure_stderr": 0.0019216723565587482, "rougeL_precision": 0.06645683534553276, "rougeL_precision_stderr": 0.0016210719314744704, "rougeL_recall": 0.27573373586686584, "rougeL_recall_stderr": 0.0044979807979480405, "rougeLsum_fmeasure": 0.09803405933935286, "rougeLsum_fmeasure_stderr": 0.001941651251192338, "rougeLsum_precision": 0.06593112120959445, "rougeLsum_precision_stderr": 0.001658127205203516, "rougeLsum_recall": 0.27052975191621376, "rougeLsum_recall_stderr": 0.004346316470292485}}, "1": {"PALM_prompt": {"bleu": 0.4158039730252241, "bleu_stderr": 0.030123702244316028, "rouge1_fmeasure": 0.11061042217596902, "rouge1_fmeasure_stderr": 0.001896691973444101, "rouge1_precision": 0.07129757158850138, "rouge1_precision_stderr": 0.0014458021710497717, "rouge1_recall": 0.3516923777837385, "rouge1_recall_stderr": 0.005356875245938342, "rouge2_fmeasure": 0.05102058176659721, "rouge2_fmeasure_stderr": 0.0011602460644751982, "rouge2_precision": 0.03275130082403469, "rouge2_precision_stderr": 0.0008586233629897961, "rouge2_recall": 0.16935881068821518, "rouge2_recall_stderr": 0.003608981708975127, "rougeL_fmeasure": 0.10417386150743224, "rougeL_fmeasure_stderr": 0.0017061426367010373, "rougeL_precision": 0.06707429668342979, "rougeL_precision_stderr": 0.0012925108337615314, "rougeL_recall": 0.3318906125869447, "rougeL_recall_stderr": 0.004907851999244201, "rougeLsum_fmeasure": 0.10579041991582103, "rougeLsum_fmeasure_stderr": 0.0017940783403277258, "rougeLsum_precision": 0.06829301701709294, "rougeLsum_precision_stderr": 0.001382760175257422, "rougeLsum_recall": 0.3346743531396663, "rougeLsum_recall_stderr": 0.004931684816531603}}, "2": {"PALM_prompt": {"bleu": 0.4701512563527578, "bleu_stderr": 0.026886552242258145, "rouge1_fmeasure": 0.11492931379435747, "rouge1_fmeasure_stderr": 0.00182500720841709, "rouge1_precision": 0.07366177252433007, "rouge1_precision_stderr": 0.0013726468980229324, "rouge1_recall": 0.3681203028015491, "rouge1_recall_stderr": 0.00531920965231789, "rouge2_fmeasure": 0.053915206348715154, "rouge2_fmeasure_stderr": 0.0011205827272533196, "rouge2_precision": 0.034248706521075915, "rouge2_precision_stderr": 0.0008123418400239756, "rouge2_recall": 0.18329047633230322, "rouge2_recall_stderr": 0.003709569109079677, "rougeL_fmeasure": 0.10865869996300166, "rougeL_fmeasure_stderr": 0.0016386072192235228, "rougeL_precision": 0.06958113652129926, "rougeL_precision_stderr": 0.0012194555607304017, "rougeL_recall": 0.34777984849254595, "rougeL_recall_stderr": 0.004850385635822989, "rougeLsum_fmeasure": 0.11000354745933813, "rougeLsum_fmeasure_stderr": 0.0017268341107613738, "rougeLsum_precision": 0.07056999669031319, "rougeLsum_precision_stderr": 0.001309195173365324, "rougeLsum_recall": 0.35129394720828927, "rougeLsum_recall_stderr": 0.004934618921348668}}, "3": {"PALM_prompt": {"bleu": 0.45967140403214185, "bleu_stderr": 0.018915450906765267, "rouge1_fmeasure": 0.11578512741953177, "rouge1_fmeasure_stderr": 0.0018016860407642794, "rouge1_precision": 0.07357498207519365, "rouge1_precision_stderr": 0.0012894587657303798, "rouge1_recall": 0.3717033044270297, "rouge1_recall_stderr": 0.005208944391672894, "rouge2_fmeasure": 0.05405358629973064, "rouge2_fmeasure_stderr": 0.0011236260094481482, "rouge2_precision": 0.03406784688732276, "rouge2_precision_stderr": 0.0007744258825105758, "rouge2_recall": 0.18396466340570689, "rouge2_recall_stderr": 0.0036457272325732177, "rougeL_fmeasure": 0.10899222865020566, "rougeL_fmeasure_stderr": 0.001609097833839522, "rougeL_precision": 0.06917854676999842, "rougeL_precision_stderr": 0.001144310682683807, "rougeL_recall": 0.35060697176269623, "rougeL_recall_stderr": 0.00479437606074137, "rougeLsum_fmeasure": 0.11016219022181273, "rougeLsum_fmeasure_stderr": 0.0016879056223478952, "rougeLsum_precision": 0.07001260386525873, "rougeLsum_precision_stderr": 0.001208373520053658, "rougeLsum_recall": 0.35270339330869044, "rougeLsum_recall_stderr": 0.004795128025623441}}, "4": {"PALM_prompt": {"bleu": 0.5060331828932195, "bleu_stderr": 0.02020298334904258, "rouge1_fmeasure": 0.11874260954395889, "rouge1_fmeasure_stderr": 0.001841460596695692, "rouge1_precision": 0.07545696856804102, "rouge1_precision_stderr": 0.0013295069425986446, "rouge1_recall": 0.38215524532137723, "rouge1_recall_stderr": 0.005257006064249761, "rouge2_fmeasure": 0.055391697796430064, "rouge2_fmeasure_stderr": 0.0011453822407753455, "rouge2_precision": 0.03492322537063844, "rouge2_precision_stderr": 0.0007984919353744566, "rouge2_recall": 0.19076862574497166, "rouge2_recall_stderr": 0.0037625561082743546, "rougeL_fmeasure": 0.11128682788075722, "rougeL_fmeasure_stderr": 0.0016381118896473732, "rougeL_precision": 0.07068689409258942, "rougeL_precision_stderr": 0.0011773265839271868, "rougeL_recall": 0.3580436414196664, "rougeL_recall_stderr": 0.004757234146741595, "rougeLsum_fmeasure": 0.11312548316704316, "rougeLsum_fmeasure_stderr": 0.001733885665431382, "rougeLsum_precision": 0.07193943875520759, "rougeLsum_precision_stderr": 0.0012539642932541059, "rougeLsum_recall": 0.36273674650157434, "rougeLsum_recall_stderr": 0.0048322200257863}}, "5": {"PALM_prompt": {"bleu": 0.5517771828764972, "bleu_stderr": 0.022455114778975327, "rouge1_fmeasure": 0.12092453971558031, "rouge1_fmeasure_stderr": 0.0017860862123390405, "rouge1_precision": 0.07733196742080763, "rouge1_precision_stderr": 0.0014423756217437809, "rouge1_recall": 0.39563687186284324, "rouge1_recall_stderr": 0.005262394955401829, "rouge2_fmeasure": 0.05681301581270846, "rouge2_fmeasure_stderr": 0.0011142102525707102, "rouge2_precision": 0.0363153814817274, "rouge2_precision_stderr": 0.0009711142556818426, "rouge2_recall": 0.19850284382637517, "rouge2_recall_stderr": 0.0037287762140779668, "rougeL_fmeasure": 0.11243199862711403, "rougeL_fmeasure_stderr": 0.0015713072726781054, "rougeL_precision": 0.07188972950502684, "rougeL_precision_stderr": 0.001287922485352482, "rougeL_recall": 0.36834538761624536, "rougeL_recall_stderr": 0.0047412581344361895, "rougeLsum_fmeasure": 0.11514068634520519, "rougeLsum_fmeasure_stderr": 0.00167267463479654, "rougeLsum_precision": 0.07370780821547528, "rougeLsum_precision_stderr": 0.0013752123208728467, "rougeLsum_recall": 0.375663046667587, "rougeLsum_recall_stderr": 0.004839534451319088}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5603285603601398, "bleu_stderr": 0.06350845627422755, "rouge1_fmeasure": 0.180834127837345, "rouge1_fmeasure_stderr": 0.0018226393141947404, "rouge1_precision": 0.1528687710677352, "rouge1_precision_stderr": 0.0018451872799138105, "rouge1_recall": 0.26526292270500057, "rouge1_recall_stderr": 0.0026118230242705744, "rouge2_fmeasure": 0.03619473403994938, "rouge2_fmeasure_stderr": 0.0008455825671580977, "rouge2_precision": 0.030278667032869758, "rouge2_precision_stderr": 0.0007428536099716036, "rouge2_recall": 0.054923561519032764, "rouge2_recall_stderr": 0.0014005369165139574, "rougeL_fmeasure": 0.14215245134569873, "rougeL_fmeasure_stderr": 0.0013102486552159249, "rougeL_precision": 0.11861266397242905, "rougeL_precision_stderr": 0.0012853651434872903, "rougeL_recall": 0.21401464455544386, "rougeL_recall_stderr": 0.002174640168243225, "rougeLsum_fmeasure": 0.16580402959973461, "rougeLsum_fmeasure_stderr": 0.001658273632773509, "rougeLsum_precision": 0.1399088595515, "rougeLsum_precision_stderr": 0.0016719145644641183, "rougeLsum_recall": 0.24419855028337706, "rougeLsum_recall_stderr": 0.00242618837717259}}, "1": {"tldr_en": {"bleu": 2.097930395803472, "bleu_stderr": 0.0689667860676578, "rouge1_fmeasure": 0.20135585426265384, "rouge1_fmeasure_stderr": 0.0019174578686321324, "rouge1_precision": 0.1760587488200307, "rouge1_precision_stderr": 0.0021433308517692616, "rouge1_recall": 0.2905131526786404, "rouge1_recall_stderr": 0.0027720915945504535, "rouge2_fmeasure": 0.04469035185534077, "rouge2_fmeasure_stderr": 0.0009421932895556139, "rouge2_precision": 0.039904978659222545, "rouge2_precision_stderr": 0.0010175772095508803, "rouge2_recall": 0.06680449936113003, "rouge2_recall_stderr": 0.0016006720847527027, "rougeL_fmeasure": 0.14494196377677354, "rougeL_fmeasure_stderr": 0.0012962613611388854, "rougeL_precision": 0.12579245898283767, "rougeL_precision_stderr": 0.0014926148052554079, "rougeL_recall": 0.21449279116615919, "rougeL_recall_stderr": 0.002174304849809369, "rougeLsum_fmeasure": 0.18785008436119408, "rougeLsum_fmeasure_stderr": 0.0017766751226834835, "rougeLsum_precision": 0.16415512753246936, "rougeLsum_precision_stderr": 0.0020043159310119177, "rougeLsum_recall": 0.2719690603001882, "rougeLsum_recall_stderr": 0.0026092075637484838}}, "2": {"tldr_en": {"bleu": 2.3842205117969226, "bleu_stderr": 0.059205731273566334, "rouge1_fmeasure": 0.2088261040326107, "rouge1_fmeasure_stderr": 0.001874377658857748, "rouge1_precision": 0.1872457835871874, "rouge1_precision_stderr": 0.00230331776183728, "rouge1_recall": 0.29922347126772186, "rouge1_recall_stderr": 0.0027968309556055853, "rouge2_fmeasure": 0.04983969411542771, "rouge2_fmeasure_stderr": 0.0009935875148833514, "rouge2_precision": 0.04599328371626012, "rouge2_precision_stderr": 0.0012291125272701046, "rouge2_recall": 0.07394818173841343, "rouge2_recall_stderr": 0.0016664518922865333, "rougeL_fmeasure": 0.15216674790614837, "rougeL_fmeasure_stderr": 0.0012972236565033947, "rougeL_precision": 0.13670976328270193, "rougeL_precision_stderr": 0.0017794276496590619, "rougeL_recall": 0.22254239577892718, "rougeL_recall_stderr": 0.0022236593739890227, "rougeLsum_fmeasure": 0.19504739622133885, "rougeLsum_fmeasure_stderr": 0.0017523823856964663, "rougeLsum_precision": 0.17497992453299946, "rougeLsum_precision_stderr": 0.002185933512530162, "rougeLsum_recall": 0.27995918850837626, "rougeLsum_recall_stderr": 0.0026384312744789275}}, "3": {"tldr_en": {"bleu": 2.40869915705471, "bleu_stderr": 0.07789335887028026, "rouge1_fmeasure": 0.17629641026596565, "rouge1_fmeasure_stderr": 0.0022136947539241033, "rouge1_precision": 0.1662142429985654, "rouge1_precision_stderr": 0.0026416468785387627, "rouge1_recall": 0.24986977548157369, "rouge1_recall_stderr": 0.0032890181005730927, "rouge2_fmeasure": 0.0418018159186987, "rouge2_fmeasure_stderr": 0.0009624306787287241, "rouge2_precision": 0.04046636002020674, "rouge2_precision_stderr": 0.0012207729469165699, "rouge2_recall": 0.06211333145799978, "rouge2_recall_stderr": 0.0016286548438397969, "rougeL_fmeasure": 0.12902698967219026, "rougeL_fmeasure_stderr": 0.001580227832203032, "rougeL_precision": 0.1226972341445923, "rougeL_precision_stderr": 0.0020762732595340627, "rougeL_recall": 0.18653239637175842, "rougeL_recall_stderr": 0.0025838673896806832, "rougeLsum_fmeasure": 0.164404065089956, "rougeLsum_fmeasure_stderr": 0.0020640920584984283, "rougeLsum_precision": 0.15527790758794774, "rougeLsum_precision_stderr": 0.0024991033262282037, "rougeLsum_recall": 0.23361194870669094, "rougeLsum_recall_stderr": 0.003112699573611116}}, "4": {"tldr_en": {"bleu": 0.5806142687719986, "bleu_stderr": 0.033729117845062905, "rouge1_fmeasure": 0.05691280089073495, "rouge1_fmeasure_stderr": 0.0019326658039671163, "rouge1_precision": 0.055601036643273365, "rouge1_precision_stderr": 0.0021459337470785093, "rouge1_recall": 0.0833227661323105, "rouge1_recall_stderr": 0.002898415590588721, "rouge2_fmeasure": 0.01374737635952299, "rouge2_fmeasure_stderr": 0.000674715701287374, "rouge2_precision": 0.013174293359221814, "rouge2_precision_stderr": 0.0007663061308797175, "rouge2_recall": 0.021158499641918186, "rouge2_recall_stderr": 0.0011331324661809316, "rougeL_fmeasure": 0.04279480754327154, "rougeL_fmeasure_stderr": 0.0014331989422404967, "rougeL_precision": 0.042372375257670535, "rougeL_precision_stderr": 0.0016983444280256317, "rougeL_recall": 0.0638591512565295, "rougeL_recall_stderr": 0.0022591164661767103, "rougeLsum_fmeasure": 0.0531726497206721, "rougeLsum_fmeasure_stderr": 0.0018026918935230168, "rougeLsum_precision": 0.0520305157406117, "rougeLsum_precision_stderr": 0.0020156930329625258, "rougeLsum_recall": 0.0780322620667648, "rougeLsum_recall_stderr": 0.00272132644178967}}, "5": {"tldr_en": {"bleu": 6.94585976161408e-07, "bleu_stderr": 1.3174488185237594e-06, "rouge1_fmeasure": 0.009182088826144953, "rouge1_fmeasure_stderr": 0.000884490457342969, "rouge1_precision": 0.009767179926120667, "rouge1_precision_stderr": 0.0010894983188808576, "rouge1_recall": 0.013172409933555533, "rouge1_recall_stderr": 0.0012864103797058989, "rouge2_fmeasure": 0.0024634396748570817, "rouge2_fmeasure_stderr": 0.0003162426008804357, "rouge2_precision": 0.002984734311253215, "rouge2_precision_stderr": 0.0005741800101122759, "rouge2_recall": 0.003524648860109897, "rouge2_recall_stderr": 0.0004637356763642486, "rougeL_fmeasure": 0.007020836626994571, "rougeL_fmeasure_stderr": 0.0006722388051045242, "rougeL_precision": 0.007618136584991738, "rougeL_precision_stderr": 0.0009019992256032961, "rougeL_recall": 0.010272433535537005, "rougeL_recall_stderr": 0.00101610174570616, "rougeLsum_fmeasure": 0.008489947637733378, "rougeLsum_fmeasure_stderr": 0.0008183438804434729, "rougeLsum_precision": 0.009073055796683023, "rougeLsum_precision_stderr": 0.0010296525618830722, "rougeLsum_recall": 0.012283249158742512, "rougeLsum_recall_stderr": 0.0012125014496237507}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.015992960640365678, "bleu_stderr": 0.004309964226158169, "rouge1_fmeasure": 0.01578340588233676, "rouge1_fmeasure_stderr": 0.0003249653469660021, "rouge1_precision": 0.012587301587302022, "rouge1_precision_stderr": 0.00026104902267950775, "rouge1_recall": 0.022364589648673473, "rouge1_recall_stderr": 0.0004833744939072677, "rouge2_fmeasure": 0.00012949433318118156, "rouge2_fmeasure_stderr": 3.494334474550083e-05, "rouge2_precision": 0.00011382113821138212, "rouge2_precision_stderr": 3.0353975217624302e-05, "rouge2_recall": 0.00016007109878338666, "rouge2_recall_stderr": 4.471587603586543e-05, "rougeL_fmeasure": 0.01578340588233676, "rougeL_fmeasure_stderr": 0.0003249653469660021, "rougeL_precision": 0.012587301587302022, "rougeL_precision_stderr": 0.00026104902267950775, "rougeL_recall": 0.022364589648673473, "rougeL_recall_stderr": 0.0004833744939072677, "rougeLsum_fmeasure": 0.015277406220358457, "rougeLsum_fmeasure_stderr": 0.0003078809116857247, "rougeLsum_precision": 0.012182539682540112, "rougeLsum_precision_stderr": 0.00024742509174155176, "rougeLsum_recall": 0.02165931137163564, "rougeLsum_recall_stderr": 0.000459555205796444}}, "1": {"generate_text_restaurant": {"bleu": 10.350327569945339, "bleu_stderr": 0.11718934671553544, "rouge1_fmeasure": 0.4266527751181952, "rouge1_fmeasure_stderr": 0.0022501365015843663, "rouge1_precision": 0.5042765618698236, "rouge1_precision_stderr": 0.0030796975247849904, "rouge1_recall": 0.4086484381469572, "rouge1_recall_stderr": 0.002887067833722799, "rouge2_fmeasure": 0.1881691820625141, "rouge2_fmeasure_stderr": 0.0018488755018199615, "rouge2_precision": 0.2251224844281287, "rouge2_precision_stderr": 0.002371088222767797, "rouge2_recall": 0.18037876413972526, "rouge2_recall_stderr": 0.0020138713692968466, "rougeL_fmeasure": 0.30398486941890124, "rougeL_fmeasure_stderr": 0.0019214834640056062, "rougeL_precision": 0.3621120282505737, "rougeL_precision_stderr": 0.0026955264006648123, "rougeL_recall": 0.2904816198239033, "rougeL_recall_stderr": 0.0023197870808026218, "rougeLsum_fmeasure": 0.34389419410927863, "rougeLsum_fmeasure_stderr": 0.0021635114388199655, "rougeLsum_precision": 0.40810791264875373, "rougeLsum_precision_stderr": 0.002934712748248161, "rougeLsum_recall": 0.3288368303483746, "rougeLsum_recall_stderr": 0.0025988682840544196}}, "2": {"generate_text_restaurant": {"bleu": 12.009238583502029, "bleu_stderr": 0.17380879614541764, "rouge1_fmeasure": 0.450266387162706, "rouge1_fmeasure_stderr": 0.002196448888042002, "rouge1_precision": 0.5313179418828156, "rouge1_precision_stderr": 0.0031912799115298347, "rouge1_recall": 0.4306386051523278, "rouge1_recall_stderr": 0.002840079037100073, "rouge2_fmeasure": 0.210504118152605, "rouge2_fmeasure_stderr": 0.0019038740164675245, "rouge2_precision": 0.25190096771908355, "rouge2_precision_stderr": 0.002542588214953114, "rouge2_recall": 0.2015739727601574, "rouge2_recall_stderr": 0.002088704993602566, "rougeL_fmeasure": 0.3281593860445986, "rougeL_fmeasure_stderr": 0.001959160818708038, "rougeL_precision": 0.38916711331825293, "rougeL_precision_stderr": 0.0028218758182783914, "rougeL_recall": 0.3134689624106426, "rougeL_recall_stderr": 0.0023628417621556967, "rougeLsum_fmeasure": 0.3705639247316313, "rougeLsum_fmeasure_stderr": 0.002174314297816321, "rougeLsum_precision": 0.4383418647272363, "rougeLsum_precision_stderr": 0.0030571625774671114, "rougeLsum_recall": 0.3539890288952586, "rougeLsum_recall_stderr": 0.002619620863090507}}, "3": {"generate_text_restaurant": {"bleu": 12.73052610748074, "bleu_stderr": 0.20845475524145052, "rouge1_fmeasure": 0.4553140890501533, "rouge1_fmeasure_stderr": 0.002165377484383932, "rouge1_precision": 0.5402185326568262, "rouge1_precision_stderr": 0.0031993997576614903, "rouge1_recall": 0.43359544071504663, "rouge1_recall_stderr": 0.0028040389355841076, "rouge2_fmeasure": 0.21907640178489668, "rouge2_fmeasure_stderr": 0.001939721876620404, "rouge2_precision": 0.26320131487991905, "rouge2_precision_stderr": 0.0025983039935128876, "rouge2_recall": 0.209198127692882, "rouge2_recall_stderr": 0.002139048542719516, "rougeL_fmeasure": 0.3353918471796088, "rougeL_fmeasure_stderr": 0.001998916793448828, "rougeL_precision": 0.3994337580544103, "rougeL_precision_stderr": 0.002886796888438663, "rougeL_recall": 0.3194631306079281, "rougeL_recall_stderr": 0.0024107987274759025, "rougeLsum_fmeasure": 0.37755919922763853, "rougeLsum_fmeasure_stderr": 0.002211654989797948, "rougeLsum_precision": 0.44829249425155315, "rougeLsum_precision_stderr": 0.0030950771464680208, "rougeLsum_recall": 0.3596781386109394, "rougeLsum_recall_stderr": 0.0026736258676108215}}, "4": {"generate_text_restaurant": {"bleu": 12.93009423211036, "bleu_stderr": 0.13316099251287217, "rouge1_fmeasure": 0.4578765621783617, "rouge1_fmeasure_stderr": 0.002148337444450519, "rouge1_precision": 0.544507581717304, "rouge1_precision_stderr": 0.003212359627271242, "rouge1_recall": 0.4332806699440339, "rouge1_recall_stderr": 0.002727419772457572, "rouge2_fmeasure": 0.22169417479547066, "rouge2_fmeasure_stderr": 0.0019729492396498515, "rouge2_precision": 0.26782026652253177, "rouge2_precision_stderr": 0.0026655934928106963, "rouge2_recall": 0.2099210989590802, "rouge2_recall_stderr": 0.002138334879435022, "rougeL_fmeasure": 0.340381517886375, "rougeL_fmeasure_stderr": 0.0020334866345106356, "rougeL_precision": 0.4053787167820524, "rougeL_precision_stderr": 0.0028911428008711014, "rougeL_recall": 0.32261903206487497, "rougeL_recall_stderr": 0.002426675616112507, "rougeLsum_fmeasure": 0.3825880397440554, "rougeLsum_fmeasure_stderr": 0.0022230837642077, "rougeLsum_precision": 0.45459079911416955, "rougeLsum_precision_stderr": 0.003097274562790854, "rougeLsum_recall": 0.3626800084342629, "rougeLsum_recall_stderr": 0.0026684199555404684}}, "5": {"generate_text_restaurant": {"bleu": 12.942335106475074, "bleu_stderr": 0.12958427262479125, "rouge1_fmeasure": 0.46024031213694333, "rouge1_fmeasure_stderr": 0.002122360256031269, "rouge1_precision": 0.5513271856284667, "rouge1_precision_stderr": 0.003206864680770012, "rouge1_recall": 0.431920944284737, "rouge1_recall_stderr": 0.002694961747047277, "rouge2_fmeasure": 0.2241886926042848, "rouge2_fmeasure_stderr": 0.0019502452160009272, "rouge2_precision": 0.27316362353166984, "rouge2_precision_stderr": 0.002696797569352779, "rouge2_recall": 0.2102662896838171, "rouge2_recall_stderr": 0.002076931826457019, "rougeL_fmeasure": 0.34400784826968916, "rougeL_fmeasure_stderr": 0.002029778497728368, "rougeL_precision": 0.4130351165572696, "rougeL_precision_stderr": 0.0029386904986997463, "rougeL_recall": 0.3230630442881531, "rougeL_recall_stderr": 0.0023857184508629115, "rougeLsum_fmeasure": 0.38556552769766467, "rougeLsum_fmeasure_stderr": 0.0022060350279482803, "rougeLsum_precision": 0.4617402458286559, "rougeLsum_precision_stderr": 0.0031279078864989898, "rougeLsum_recall": 0.3622147621157937, "rougeLsum_recall_stderr": 0.0026083274394287895}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.7655886076372085, "bleu_stderr": 0.10736235981174763, "rouge1_fmeasure": 0.20350913595442885, "rouge1_fmeasure_stderr": 0.002435224602899981, "rouge1_precision": 0.1492164696849081, "rouge1_precision_stderr": 0.001996525925599697, "rouge1_recall": 0.34446883281449314, "rouge1_recall_stderr": 0.0041971721126418384, "rouge2_fmeasure": 0.04410086560503002, "rouge2_fmeasure_stderr": 0.0014787510862099547, "rouge2_precision": 0.03188261389262752, "rouge2_precision_stderr": 0.001117793719292304, "rouge2_recall": 0.07731292695855164, "rouge2_recall_stderr": 0.0026753338382779343, "rougeL_fmeasure": 0.15384187542412775, "rougeL_fmeasure_stderr": 0.001805266694766897, "rougeL_precision": 0.11261049980189601, "rougeL_precision_stderr": 0.0014872078652013153, "rougeL_recall": 0.26208578523120984, "rougeL_recall_stderr": 0.0032624535599867833, "rougeLsum_fmeasure": 0.1599245456828896, "rougeLsum_fmeasure_stderr": 0.002084477270792375, "rougeLsum_precision": 0.11697579245444102, "rougeLsum_precision_stderr": 0.0016691184699523902, "rougeLsum_recall": 0.2724297544641497, "rougeLsum_recall_stderr": 0.003726544122785748}}, "1": {"article_DOC_summary": {"bleu": 1.334942775777946, "bleu_stderr": 0.10233928786607425, "rouge1_fmeasure": 0.16876877482468633, "rouge1_fmeasure_stderr": 0.0024452295649194868, "rouge1_precision": 0.12002124185701755, "rouge1_precision_stderr": 0.0018175183259017203, "rouge1_recall": 0.2964576588584416, "rouge1_recall_stderr": 0.004170424931758962, "rouge2_fmeasure": 0.033094435873653585, "rouge2_fmeasure_stderr": 0.0013682847216061845, "rouge2_precision": 0.023305600673673897, "rouge2_precision_stderr": 0.0009690705744659361, "rouge2_recall": 0.05951174562054147, "rouge2_recall_stderr": 0.0025195976023397624, "rougeL_fmeasure": 0.1345498469097093, "rougeL_fmeasure_stderr": 0.0018375498452298036, "rougeL_precision": 0.09544491814686483, "rougeL_precision_stderr": 0.0013491148935310148, "rougeL_recall": 0.2381310181067947, "rougeL_recall_stderr": 0.0033031057668250915, "rougeLsum_fmeasure": 0.13375301216657454, "rougeLsum_fmeasure_stderr": 0.00200383702286692, "rougeLsum_precision": 0.09484008472273599, "rougeLsum_precision_stderr": 0.0014628305860636154, "rougeLsum_recall": 0.2369080381179775, "rougeLsum_recall_stderr": 0.003585080900294894}}, "2": {"article_DOC_summary": {"bleu": 1.1932725991547866, "bleu_stderr": 0.10300509523400306, "rouge1_fmeasure": 0.16609014549719567, "rouge1_fmeasure_stderr": 0.0024607806322175176, "rouge1_precision": 0.11821820446249875, "rouge1_precision_stderr": 0.0018185864766111957, "rouge1_recall": 0.2908025055532656, "rouge1_recall_stderr": 0.0042529606032905, "rouge2_fmeasure": 0.031586014301670295, "rouge2_fmeasure_stderr": 0.0013106614943761185, "rouge2_precision": 0.022254316677808094, "rouge2_precision_stderr": 0.0009242323850632092, "rouge2_recall": 0.05666861623781562, "rouge2_recall_stderr": 0.002430545087624139, "rougeL_fmeasure": 0.134717900004207, "rougeL_fmeasure_stderr": 0.001903412284840825, "rougeL_precision": 0.09574752179314858, "rougeL_precision_stderr": 0.0013963865884851763, "rougeL_recall": 0.23706400212234835, "rougeL_recall_stderr": 0.00341363560974952, "rougeLsum_fmeasure": 0.12983954819837065, "rougeLsum_fmeasure_stderr": 0.0019882119195634884, "rougeLsum_precision": 0.09209325512314095, "rougeLsum_precision_stderr": 0.00144071361484595, "rougeLsum_recall": 0.2294692388656739, "rougeLsum_recall_stderr": 0.003615102737414297}}, "3": {"article_DOC_summary": {"bleu": 1.163112466527613, "bleu_stderr": 0.12085241458018103, "rouge1_fmeasure": 0.15539332911319093, "rouge1_fmeasure_stderr": 0.0024835494495013275, "rouge1_precision": 0.11341845455160869, "rouge1_precision_stderr": 0.001959503450855617, "rouge1_recall": 0.2667644085257336, "rouge1_recall_stderr": 0.0042849481116954856, "rouge2_fmeasure": 0.02888706920552339, "rouge2_fmeasure_stderr": 0.0012878557813863784, "rouge2_precision": 0.020639263292355745, "rouge2_precision_stderr": 0.000928784117298754, "rouge2_recall": 0.05152062271963752, "rouge2_recall_stderr": 0.002396458240143405, "rougeL_fmeasure": 0.12863266229499296, "rougeL_fmeasure_stderr": 0.0019430115068211663, "rougeL_precision": 0.09372416249129488, "rougeL_precision_stderr": 0.0015179579727989097, "rougeL_recall": 0.22198783584917942, "rougeL_recall_stderr": 0.0034823484122581258, "rougeLsum_fmeasure": 0.12209298471375925, "rougeLsum_fmeasure_stderr": 0.002026617243090104, "rougeLsum_precision": 0.08892751977845476, "rougeLsum_precision_stderr": 0.001571905443563374, "rougeLsum_recall": 0.21138046965039992, "rougeLsum_recall_stderr": 0.003639708461716417}}, "4": {"article_DOC_summary": {"bleu": 0.6927044824583349, "bleu_stderr": 0.12209299962837876, "rouge1_fmeasure": 0.044764900055804344, "rouge1_fmeasure_stderr": 0.0025907021244305356, "rouge1_precision": 0.0385284188334343, "rouge1_precision_stderr": 0.0024541291074536404, "rouge1_recall": 0.06873563673644689, "rouge1_recall_stderr": 0.004047379056109321, "rouge2_fmeasure": 0.008866921103262896, "rouge2_fmeasure_stderr": 0.0009234537826135504, "rouge2_precision": 0.007182287752001923, "rouge2_precision_stderr": 0.0008371291250086691, "rouge2_recall": 0.0141987217002901, "rouge2_recall_stderr": 0.0014723708303510416, "rougeL_fmeasure": 0.037047054109218464, "rougeL_fmeasure_stderr": 0.0021110962657516935, "rougeL_precision": 0.032040531079265175, "rougeL_precision_stderr": 0.002047736410529299, "rougeL_recall": 0.057305380660103995, "rougeL_recall_stderr": 0.003353875117836262, "rougeLsum_fmeasure": 0.03573621465314158, "rougeLsum_fmeasure_stderr": 0.0020879383106949334, "rougeLsum_precision": 0.03131602365914373, "rougeLsum_precision_stderr": 0.0020836122909593767, "rougeLsum_recall": 0.05467024316761279, "rougeLsum_recall_stderr": 0.003252765338873642}}, "5": {"article_DOC_summary": {"bleu": 1.3405167133155682e-36, "bleu_stderr": 2.672073490429972e-32, "rouge1_fmeasure": 0.0025194661352948475, "rouge1_fmeasure_stderr": 0.0007719967235096721, "rouge1_precision": 0.0028080565772603864, "rouge1_precision_stderr": 0.0008488029691177819, "rouge1_recall": 0.002386805100318374, "rouge1_recall_stderr": 0.0007460459832752868, "rouge2_fmeasure": 0.00037358459253719785, "rouge2_fmeasure_stderr": 0.00022190955128697563, "rouge2_precision": 0.0004049933295216314, "rouge2_precision_stderr": 0.00023122921367252418, "rouge2_recall": 0.0003590286886624289, "rouge2_recall_stderr": 0.00022142727159201034, "rougeL_fmeasure": 0.0018917419847305783, "rougeL_fmeasure_stderr": 0.0005432972804744207, "rougeL_precision": 0.0021247816040882227, "rougeL_precision_stderr": 0.000618261810698163, "rougeL_recall": 0.001790584534209606, "rougeL_recall_stderr": 0.0005178087830381592, "rougeLsum_fmeasure": 0.0018849028066356447, "rougeLsum_fmeasure_stderr": 0.0005477474195112749, "rougeLsum_precision": 0.0021247816040882222, "rougeLsum_precision_stderr": 0.0006264477711098739, "rougeLsum_recall": 0.0017778221393717397, "rougeLsum_recall_stderr": 0.0005184694595843492}}}} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_0.csv b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..942b680dda7c9c537793562bb676e05f1a4b92a1 --- /dev/null +++ b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.014888272588203934,0 +anli_r2,acc,0.335,0.014933117490932572,0 +anli_r3,acc,0.3358333333333333,0.01363926119093288,0 +arc_challenge,acc,0.2627986348122867,0.012862523175351333,0 +arc_challenge,acc_norm,0.27986348122866894,0.013119040897725925,0 +arc_easy,acc,0.5711279461279462,0.010155440652900152,0 +arc_easy,acc_norm,0.5008417508417509,0.010259768981815232,0 +boolq,acc,0.5850152905198777,0.008617716361921567,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.2126984126984127,,1 +copa,acc,0.69,0.04648231987117316,0 +hellaswag,acc,0.44124676359290976,0.004955212787832387,0 +hellaswag,acc_norm,0.5688109938259311,0.00494230276800211,0 +piqa,acc,0.7393906420021763,0.010241826155811623,0 +piqa,acc_norm,0.7475516866158868,0.010135665547362354,0 +rte,acc,0.5487364620938628,0.029953149241808946,0 +sciq,acc,0.823,0.012075463420375063,0 +sciq,acc_norm,0.737,0.013929286594259727,0 +storycloze_2016,acc,0.6932121859967931,0.010664275190473634,0 +winogrande,acc,0.5548539857932123,0.013967662954355484,0 diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_0_lm-eval_global_step52452_2023-02-01-23-55-44_0shots_backup.json b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_0_lm-eval_global_step52452_2023-02-01-23-55-44_0shots_backup.json deleted file mode 100644 index 3a392b76b2f172770cfb8eac78a827c12ae707eb..0000000000000000000000000000000000000000 --- a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_0_lm-eval_global_step52452_2023-02-01-23-55-44_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203934 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.014933117490932572 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.01363926119093288 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.2126984126984127 - }, - "copa": { - "acc": 0.69, - "acc_stderr": 0.04648231987117316 - }, - "hellaswag": { - "acc": 0.44124676359290976, - "acc_stderr": 0.004955212787832387, - "acc_norm": 0.5688109938259311, - "acc_norm_stderr": 0.00494230276800211 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808946 - }, - "winogrande": { - "acc": 0.5548539857932123, - "acc_stderr": 0.013967662954355484 - }, - "storycloze_2016": { - "acc": 0.6932121859967931, - "acc_stderr": 0.010664275190473634 - }, - "boolq": { - "acc": 0.5850152905198777, - "acc_stderr": 0.008617716361921567 - }, - "arc_easy": { - "acc": 0.5711279461279462, - "acc_stderr": 0.010155440652900152, - "acc_norm": 0.5008417508417509, - "acc_norm_stderr": 0.010259768981815232 - }, - "arc_challenge": { - "acc": 0.2627986348122867, - "acc_stderr": 0.012862523175351333, - "acc_norm": 0.27986348122866894, - "acc_norm_stderr": 0.013119040897725925 - }, - "sciq": { - "acc": 0.823, - "acc_stderr": 0.012075463420375063, - "acc_norm": 0.737, - "acc_norm_stderr": 0.013929286594259727 - }, - "piqa": { - "acc": 0.7393906420021763, - "acc_stderr": 0.010241826155811623, - "acc_norm": 0.7475516866158868, - "acc_norm_stderr": 0.010135665547362354 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_1.csv b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..07db16ee3da77e86c6a1fbc49cf8979ae5f96c25 --- /dev/null +++ b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.337,0.014955087918653602,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.26621160409556316,0.012915774781523203,0 +arc_challenge,acc_norm,0.28668941979522183,0.01321498632927477,0 +arc_easy,acc,0.5740740740740741,0.010146568651002255,0 +arc_easy,acc_norm,0.5475589225589226,0.010213265860171392,0 +boolq,acc,0.5972477064220183,0.008578054401368407,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.3395061728395062,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.43596893049193386,0.004948696280312422,0 +hellaswag,acc_norm,0.5685122485560645,0.004942716091996087,0 +piqa,acc,0.735582154515778,0.010289787244767163,0 +piqa,acc_norm,0.7475516866158868,0.01013566554736235,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.874,0.010499249222408033,0 +sciq,acc_norm,0.846,0.0114199130650987,0 +storycloze_2016,acc,0.6825227151256013,0.010764505409830937,0 +winogrande,acc,0.5564325177584846,0.013962694907620404,0 diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_1_lm-eval_global_step52452_2023-02-01-23-55-44_1shots_backup.json b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_1_lm-eval_global_step52452_2023-02-01-23-55-44_1shots_backup.json deleted file mode 100644 index 46d53dcb0f55ba568e988f16c6b3189cfdf4a6b7..0000000000000000000000000000000000000000 --- a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_1_lm-eval_global_step52452_2023-02-01-23-55-44_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.014955087918653602 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.3395061728395062 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.43596893049193386, - "acc_stderr": 0.004948696280312422, - "acc_norm": 0.5685122485560645, - "acc_norm_stderr": 0.004942716091996087 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5564325177584846, - "acc_stderr": 0.013962694907620404 - }, - "storycloze_2016": { - "acc": 0.6825227151256013, - "acc_stderr": 0.010764505409830937 - }, - "boolq": { - "acc": 0.5972477064220183, - "acc_stderr": 0.008578054401368407 - }, - "arc_easy": { - "acc": 0.5740740740740741, - "acc_stderr": 0.010146568651002255, - "acc_norm": 0.5475589225589226, - "acc_norm_stderr": 0.010213265860171392 - }, - "arc_challenge": { - "acc": 0.26621160409556316, - "acc_stderr": 0.012915774781523203, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.01321498632927477 - }, - "sciq": { - "acc": 0.874, - "acc_stderr": 0.010499249222408033, - "acc_norm": 0.846, - "acc_norm_stderr": 0.0114199130650987 - }, - "piqa": { - "acc": 0.735582154515778, - "acc_stderr": 0.010289787244767163, - "acc_norm": 0.7475516866158868, - "acc_norm_stderr": 0.01013566554736235 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_2.csv b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..9b5789b92a84d81f5b15287c967a236a78559cf4 --- /dev/null +++ b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.014806864733738863,0 +anli_r2,acc,0.321,0.014770821817934642,0 +anli_r3,acc,0.3466666666666667,0.013744022550571947,0 +arc_challenge,acc,0.27559726962457337,0.013057169655761843,0 +arc_challenge,acc_norm,0.28071672354948807,0.01313123812697558,0 +arc_easy,acc,0.5841750841750841,0.010113348244647869,0 +arc_easy,acc_norm,0.5791245791245792,0.010130502164066333,0 +boolq,acc,0.590519877675841,0.008600549751320926,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.2787878787878788,,1 +copa,acc,0.74,0.044084400227680794,0 +hellaswag,acc,0.4342760406293567,0.004946485466544626,0 +hellaswag,acc_norm,0.563433578968333,0.004949462563681335,0 +piqa,acc,0.7410228509249184,0.010220966031405616,0 +piqa,acc_norm,0.7431991294885746,0.01019286480227804,0 +rte,acc,0.51985559566787,0.030072723167317184,0 +sciq,acc,0.887,0.010016552866696853,0 +sciq,acc_norm,0.87,0.010640169792499361,0 +storycloze_2016,acc,0.6910742918225548,0.010684853966268452,0 +winogrande,acc,0.5588003157063931,0.013954975072834726,0 diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_2_lm-eval_global_step52452_2023-02-01-23-55-44_2shots_backup.json b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_2_lm-eval_global_step52452_2023-02-01-23-55-44_2shots_backup.json deleted file mode 100644 index b016b74d3f10542cd5aff3176cacb4bfd4962883..0000000000000000000000000000000000000000 --- a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_2_lm-eval_global_step52452_2023-02-01-23-55-44_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.014806864733738863 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.014770821817934642 - }, - "anli_r3": { - "acc": 0.3466666666666667, - "acc_stderr": 0.013744022550571947 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.2787878787878788 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.044084400227680794 - }, - "hellaswag": { - "acc": 0.4342760406293567, - "acc_stderr": 0.004946485466544626, - "acc_norm": 0.563433578968333, - "acc_norm_stderr": 0.004949462563681335 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317184 - }, - "winogrande": { - "acc": 0.5588003157063931, - "acc_stderr": 0.013954975072834726 - }, - "storycloze_2016": { - "acc": 0.6910742918225548, - "acc_stderr": 0.010684853966268452 - }, - "boolq": { - "acc": 0.590519877675841, - "acc_stderr": 0.008600549751320926 - }, - "arc_easy": { - "acc": 0.5841750841750841, - "acc_stderr": 0.010113348244647869, - "acc_norm": 0.5791245791245792, - "acc_norm_stderr": 0.010130502164066333 - }, - "arc_challenge": { - "acc": 0.27559726962457337, - "acc_stderr": 0.013057169655761843, - "acc_norm": 0.28071672354948807, - "acc_norm_stderr": 0.01313123812697558 - }, - "sciq": { - "acc": 0.887, - "acc_stderr": 0.010016552866696853, - "acc_norm": 0.87, - "acc_norm_stderr": 0.010640169792499361 - }, - "piqa": { - "acc": 0.7410228509249184, - "acc_stderr": 0.010220966031405616, - "acc_norm": 0.7431991294885746, - "acc_norm_stderr": 0.01019286480227804 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_3.csv b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..1c257f3d46597fbf9d8c36b929c456da119c1e42 --- /dev/null +++ b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.014830507204541031,0 +anli_r2,acc,0.354,0.015129868238451772,0 +anli_r3,acc,0.35,0.013774667009018552,0 +arc_challenge,acc,0.2721843003412969,0.013006600406423707,0 +arc_challenge,acc_norm,0.2738907849829352,0.013032004972989503,0 +arc_easy,acc,0.5913299663299664,0.010087174498762883,0 +arc_easy,acc_norm,0.5791245791245792,0.010130502164066333,0 +boolq,acc,0.5856269113149847,0.00861586377642113,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.27240896358543415,,1 +copa,acc,0.74,0.04408440022768078,0 +hellaswag,acc,0.43596893049193386,0.00494869628031242,0 +hellaswag,acc_norm,0.5700059749053973,0.004940631135803532,0 +piqa,acc,0.7388465723612623,0.01024873864993558,0 +piqa,acc_norm,0.7404787812840044,0.010227939888173923,0 +rte,acc,0.5415162454873647,0.02999253538537331,0 +sciq,acc,0.891,0.00985982840703719,0 +sciq,acc_norm,0.877,0.010391293421849877,0 +storycloze_2016,acc,0.6910742918225548,0.010684853966268445,0 +winogrande,acc,0.5548539857932123,0.013967662954355486,0 diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_3_lm-eval_global_step52452_2023-02-01-23-55-44_3shots_backup.json b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_3_lm-eval_global_step52452_2023-02-01-23-55-44_3shots_backup.json deleted file mode 100644 index 57d62297807944f334a52056d998becc1eb837f9..0000000000000000000000000000000000000000 --- a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_3_lm-eval_global_step52452_2023-02-01-23-55-44_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.014830507204541031 - }, - "anli_r2": { - "acc": 0.354, - "acc_stderr": 0.015129868238451772 - }, - "anli_r3": { - "acc": 0.35, - "acc_stderr": 0.013774667009018552 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.27240896358543415 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768078 - }, - "hellaswag": { - "acc": 0.43596893049193386, - "acc_stderr": 0.00494869628031242, - "acc_norm": 0.5700059749053973, - "acc_norm_stderr": 0.004940631135803532 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.02999253538537331 - }, - "winogrande": { - "acc": 0.5548539857932123, - "acc_stderr": 0.013967662954355486 - }, - "storycloze_2016": { - "acc": 0.6910742918225548, - "acc_stderr": 0.010684853966268445 - }, - "boolq": { - "acc": 0.5856269113149847, - "acc_stderr": 0.00861586377642113 - }, - "arc_easy": { - "acc": 0.5913299663299664, - "acc_stderr": 0.010087174498762883, - "acc_norm": 0.5791245791245792, - "acc_norm_stderr": 0.010130502164066333 - }, - "arc_challenge": { - "acc": 0.2721843003412969, - "acc_stderr": 0.013006600406423707, - "acc_norm": 0.2738907849829352, - "acc_norm_stderr": 0.013032004972989503 - }, - "sciq": { - "acc": 0.891, - "acc_stderr": 0.00985982840703719, - "acc_norm": 0.877, - "acc_norm_stderr": 0.010391293421849877 - }, - "piqa": { - "acc": 0.7388465723612623, - "acc_stderr": 0.01024873864993558, - "acc_norm": 0.7404787812840044, - "acc_norm_stderr": 0.010227939888173923 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_4.csv b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..7b3d0e024ceaaa243fa598f0ad25ea7094182b38 --- /dev/null +++ b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.355,0.015139491543780529,0 +anli_r2,acc,0.34,0.014987482264363937,0 +anli_r3,acc,0.34,0.01368049572576779,0 +arc_challenge,acc,0.2773037542662116,0.013082095839059373,0 +arc_challenge,acc_norm,0.28668941979522183,0.013214986329274767,0 +arc_easy,acc,0.5925925925925926,0.010082326627832865,0 +arc_easy,acc_norm,0.5824915824915825,0.010119187377776038,0 +boolq,acc,0.5840978593272171,0.008620469604001021,1 +cb,acc,0.5535714285714286,0.06703189227942395,1 +cb,f1,0.3904761904761904,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.433877713602868,0.00494595674494381,0 +hellaswag,acc_norm,0.5678151762597092,0.0049436733882762636,0 +piqa,acc,0.736126224156692,0.010282996367695564,0 +piqa,acc_norm,0.7442872687704026,0.010178690109459878,0 +rte,acc,0.5054151624548736,0.030094698123239966,0 +sciq,acc,0.889,0.009938701010583726,0 +sciq,acc_norm,0.884,0.010131468138757002,0 +storycloze_2016,acc,0.6980224478888295,0.010616985436073357,0 +winogrande,acc,0.5548539857932123,0.013967662954355482,0 diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_4_lm-eval_global_step52452_2023-02-01-23-55-44_4shots_backup.json b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_4_lm-eval_global_step52452_2023-02-01-23-55-44_4shots_backup.json deleted file mode 100644 index 1ffc139a2a310e3b1995d90f8f7b5378b6d06482..0000000000000000000000000000000000000000 --- a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_4_lm-eval_global_step52452_2023-02-01-23-55-44_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.355, - "acc_stderr": 0.015139491543780529 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.01368049572576779 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.3904761904761904 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.433877713602868, - "acc_stderr": 0.00494595674494381, - "acc_norm": 0.5678151762597092, - "acc_norm_stderr": 0.0049436733882762636 - }, - "rte": { - "acc": 0.5054151624548736, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5548539857932123, - "acc_stderr": 0.013967662954355482 - }, - "storycloze_2016": { - "acc": 0.6980224478888295, - "acc_stderr": 0.010616985436073357 - }, - "boolq": { - "acc": 0.5840978593272171, - "acc_stderr": 0.008620469604001021 - }, - "arc_easy": { - "acc": 0.5925925925925926, - "acc_stderr": 0.010082326627832865, - "acc_norm": 0.5824915824915825, - "acc_norm_stderr": 0.010119187377776038 - }, - "arc_challenge": { - "acc": 0.2773037542662116, - "acc_stderr": 0.013082095839059373, - "acc_norm": 0.28668941979522183, - "acc_norm_stderr": 0.013214986329274767 - }, - "sciq": { - "acc": 0.889, - "acc_stderr": 0.009938701010583726, - "acc_norm": 0.884, - "acc_norm_stderr": 0.010131468138757002 - }, - "piqa": { - "acc": 0.736126224156692, - "acc_stderr": 0.010282996367695564, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.010178690109459878 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_5.csv b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..cfa1208f54a01105d5af25c8c5b97803e7d87d30 --- /dev/null +++ b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.332,0.014899597242811473,0 +anli_r2,acc,0.33,0.014876872027456732,0 +anli_r3,acc,0.36583333333333334,0.013910212062701169,0 +arc_challenge,acc,0.2713310580204778,0.012993807727545796,0 +arc_challenge,acc_norm,0.2901023890784983,0.013261573677520769,0 +arc_easy,acc,0.5917508417508418,0.010085566195791247,0 +arc_easy,acc_norm,0.5829124579124579,0.010117738967781988,0 +boolq,acc,0.5785932721712538,0.008636344580414687,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.345062162796153,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.43328022306313485,0.004945157565218194,0 +hellaswag,acc_norm,0.5707030472017527,0.004939642460172596,0 +piqa,acc,0.7317736670293797,0.010336761992404485,0 +piqa,acc_norm,0.7426550598476604,0.010199921064792509,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.899,0.009533618929340983,0 +sciq,acc_norm,0.889,0.009938701010583726,0 +storycloze_2016,acc,0.6857295563869589,0.01073513228510818,0 +winogrande,acc,0.5564325177584846,0.013962694907620402,0 diff --git a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_5_lm-eval_global_step52452_2023-02-01-23-55-44_5shots_backup.json b/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_5_lm-eval_global_step52452_2023-02-01-23-55-44_5shots_backup.json deleted file mode 100644 index 72a8d807e24d8ad5f086caeb9f3ef4dffda45129..0000000000000000000000000000000000000000 --- a/2b855b55bc4seed4/evaluation/rankeval/2b855b55bc4seed4_5_lm-eval_global_step52452_2023-02-01-23-55-44_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811473 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.014876872027456732 - }, - "anli_r3": { - "acc": 0.36583333333333334, - "acc_stderr": 0.013910212062701169 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.345062162796153 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.43328022306313485, - "acc_stderr": 0.004945157565218194, - "acc_norm": 0.5707030472017527, - "acc_norm_stderr": 0.004939642460172596 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5564325177584846, - "acc_stderr": 0.013962694907620402 - }, - "storycloze_2016": { - "acc": 0.6857295563869589, - "acc_stderr": 0.01073513228510818 - }, - "boolq": { - "acc": 0.5785932721712538, - "acc_stderr": 0.008636344580414687 - }, - "arc_easy": { - "acc": 0.5917508417508418, - "acc_stderr": 0.010085566195791247, - "acc_norm": 0.5829124579124579, - "acc_norm_stderr": 0.010117738967781988 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.012993807727545796, - "acc_norm": 0.2901023890784983, - "acc_norm_stderr": 0.013261573677520769 - }, - "sciq": { - "acc": 0.899, - "acc_stderr": 0.009533618929340983, - "acc_norm": 0.889, - "acc_norm_stderr": 0.009938701010583726 - }, - "piqa": { - "acc": 0.7317736670293797, - "acc_stderr": 0.010336761992404485, - "acc_norm": 0.7426550598476604, - "acc_norm_stderr": 0.010199921064792509 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file