Muennighoff commited on
Commit
4137440
·
1 Parent(s): c09aed7
Files changed (31) hide show
  1. 2b855b9bc4seed1/evaluation/generation/merged.csv +16 -2
  2. 2b855b9bc4seed1/evaluation/generation/merged.json +1 -1
  3. 2b855b9bc4seed2/evaluation/generation/merged.csv +20 -2
  4. 2b855b9bc4seed2/evaluation/generation/merged.json +1 -1
  5. 2b855b9bc4seed3/evaluation/generation/merged.csv +20 -2
  6. 2b855b9bc4seed3/evaluation/generation/merged.json +1 -1
  7. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_2.csv +14 -0
  8. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_2.json +39 -1
  9. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_2_lm-eval_global_step52452_2023-02-25-10-40-28_2shots_backup.json +0 -49
  10. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_3.csv +14 -0
  11. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_3.json +56 -1
  12. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_4.csv +14 -0
  13. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_4.json +56 -1
  14. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_5.csv +14 -0
  15. 2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_5.json +56 -1
  16. 2b855b9bc4seed4/evaluation/generation/merged.csv +20 -2
  17. 2b855b9bc4seed4/evaluation/generation/merged.json +1 -1
  18. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_0.csv +14 -0
  19. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_0_lm-eval_global_step52452_2023-02-25-10-40-14_0shots_backup.json +0 -87
  20. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_1.csv +14 -0
  21. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_1.json +29 -1
  22. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_1_lm-eval_global_step52452_2023-02-25-10-40-14_1shots_backup.json +0 -59
  23. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_2.csv +14 -0
  24. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_2.json +39 -1
  25. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_2_lm-eval_global_step52452_2023-02-25-10-40-14_2shots_backup.json +0 -49
  26. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_3.csv +14 -0
  27. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_3.json +56 -1
  28. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_4.csv +14 -0
  29. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_4.json +56 -1
  30. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_5.csv +14 -0
  31. 2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_5.json +56 -1
2b855b9bc4seed1/evaluation/generation/merged.csv CHANGED
@@ -29,11 +29,25 @@ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05142765596627262
29
  web_nlg_en,0,median,rouge2_fmeasure,0.05142765596627262
30
  web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051755190143320286
31
  web_nlg_en,1,median,rouge2_fmeasure,0.051755190143320286
32
- web_nlg_en,1,average,multiple,0.051591423054796456
 
 
 
 
 
 
 
 
33
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03487145839395421
34
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03487145839395421
35
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03614437414445121
36
  wiki_lingua_en,1,median,rouge2_fmeasure,0.03614437414445121
37
  wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04349341038860359
38
  wiki_lingua_en,2,median,rouge2_fmeasure,0.04349341038860359
39
- wiki_lingua_en,2,average,multiple,0.038169747642336334
 
 
 
 
 
 
 
29
  web_nlg_en,0,median,rouge2_fmeasure,0.05142765596627262
30
  web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051755190143320286
31
  web_nlg_en,1,median,rouge2_fmeasure,0.051755190143320286
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05337272852022536
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.05337272852022536
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05377700106212916
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05377700106212916
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.053735655975499355
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.053735655975499355
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05404322067951011
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05404322067951011
40
+ web_nlg_en,5,average,multiple,0.05301857539115948
41
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03487145839395421
42
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03487145839395421
43
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03614437414445121
44
  wiki_lingua_en,1,median,rouge2_fmeasure,0.03614437414445121
45
  wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04349341038860359
46
  wiki_lingua_en,2,median,rouge2_fmeasure,0.04349341038860359
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04153907298260113
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04153907298260113
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013711405505692513
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013711405505692513
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002246221879718567
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.002246221879718567
53
+ wiki_lingua_en,5,average,multiple,0.02866765721583687
2b855b9bc4seed1/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
1
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4066206711982134, "bleu_stderr": 0.033723013965742, "rouge1_fmeasure": 0.10895883012469396, "rouge1_fmeasure_stderr": 0.002008528704603682, "rouge1_precision": 0.07192732542952907, "rouge1_precision_stderr": 0.0016382475971692088, "rouge1_recall": 0.30547187741815907, "rouge1_recall_stderr": 0.004666013432977962, "rouge2_fmeasure": 0.05142765596627262, "rouge2_fmeasure_stderr": 0.001246824543408159, "rouge2_precision": 0.034124224521766514, "rouge2_precision_stderr": 0.0011258346949061335, "rouge2_recall": 0.14893550564787877, "rouge2_recall_stderr": 0.0032632079712954696, "rougeL_fmeasure": 0.10554163962469583, "rougeL_fmeasure_stderr": 0.0018999401584976951, "rougeL_precision": 0.06951188299150508, "rougeL_precision_stderr": 0.0015536321007724944, "rougeL_recall": 0.29797998285094185, "rougeL_recall_stderr": 0.004578539464668976, "rougeLsum_fmeasure": 0.10415553154282955, "rougeLsum_fmeasure_stderr": 0.0018954370177080594, "rougeLsum_precision": 0.06880186502940373, "rougeLsum_precision_stderr": 0.0015660770018247742, "rougeLsum_recall": 0.2918595743249444, "rougeLsum_recall_stderr": 0.004365973340593178}}, "1": {"PALM_prompt": {"bleu": 0.441015227234624, "bleu_stderr": 0.02811928663369506, "rouge1_fmeasure": 0.11213595098636865, "rouge1_fmeasure_stderr": 0.001929589035018206, "rouge1_precision": 0.0726548726317011, "rouge1_precision_stderr": 0.0015123203660414558, "rouge1_recall": 0.3480211924440434, "rouge1_recall_stderr": 0.005007162611700053, "rouge2_fmeasure": 0.051755190143320286, "rouge2_fmeasure_stderr": 0.0012231844221965455, "rouge2_precision": 0.0335432322481532, "rouge2_precision_stderr": 0.0009695182541133993, "rouge2_recall": 0.16639593533541566, "rouge2_recall_stderr": 0.0034320944741141886, "rougeL_fmeasure": 0.10765664842911221, "rougeL_fmeasure_stderr": 0.001806223035008682, "rougeL_precision": 0.06967949917271966, "rougeL_precision_stderr": 0.0014170582927231956, "rougeL_recall": 0.3335173366403848, "rougeL_recall_stderr": 0.004760042112091419, "rougeLsum_fmeasure": 0.10716257387185534, "rougeLsum_fmeasure_stderr": 0.0018203621728808315, "rougeLsum_precision": 0.06949682130009485, "rougeLsum_precision_stderr": 0.00144169867509152, "rougeLsum_recall": 0.33110876740142015, "rougeLsum_recall_stderr": 0.004635864354540981}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5382561696068602, "bleu_stderr": 0.06399544328700199, "rouge1_fmeasure": 0.17482546253918968, "rouge1_fmeasure_stderr": 0.0017962093241495317, "rouge1_precision": 0.1493431846788089, "rouge1_precision_stderr": 0.0018651057951166279, "rouge1_recall": 0.2548583336571623, "rouge1_recall_stderr": 0.002528299389518057, "rouge2_fmeasure": 0.03487145839395421, "rouge2_fmeasure_stderr": 0.0008257165713416542, "rouge2_precision": 0.02958059388980271, "rouge2_precision_stderr": 0.0007357126571931123, "rouge2_recall": 0.0521633146578729, "rouge2_recall_stderr": 0.0013585554854051705, "rougeL_fmeasure": 0.13645058466848356, "rougeL_fmeasure_stderr": 0.0012505081899530457, "rougeL_precision": 0.11482322910631781, "rougeL_precision_stderr": 0.0012610070217646853, "rougeL_recall": 0.2047896774759962, "rougeL_recall_stderr": 0.0020475160072816, "rougeLsum_fmeasure": 0.16137882388270616, "rougeLsum_fmeasure_stderr": 0.0016412506052086466, "rougeLsum_precision": 0.13765195894138185, "rougeLsum_precision_stderr": 0.0017075576824449126, "rougeLsum_recall": 0.23641320862844858, "rougeLsum_recall_stderr": 0.0023610992069355388}}, "1": {"tldr_en": {"bleu": 1.7101242197452051, "bleu_stderr": 0.04333761308856783, "rouge1_fmeasure": 0.1796806784514, "rouge1_fmeasure_stderr": 0.0018714401391125607, "rouge1_precision": 0.1550521213351438, "rouge1_precision_stderr": 0.0019541763971361132, "rouge1_recall": 0.2591201747750158, "rouge1_recall_stderr": 0.0026478655932746374, "rouge2_fmeasure": 0.03614437414445121, "rouge2_fmeasure_stderr": 0.0008460043362571757, "rouge2_precision": 0.030990812889466937, "rouge2_precision_stderr": 0.0007609017578133211, "rouge2_recall": 0.05362608829041054, "rouge2_recall_stderr": 0.001393078096579126, "rougeL_fmeasure": 0.13484320380645512, "rougeL_fmeasure_stderr": 0.001257828140076275, "rougeL_precision": 0.11485232716958933, "rougeL_precision_stderr": 0.0012878987216988057, "rougeL_recall": 0.19958644094767267, "rougeL_recall_stderr": 0.002041519133152489, "rougeLsum_fmeasure": 0.16735135112097876, "rougeLsum_fmeasure_stderr": 0.0017317067736812232, "rougeLsum_precision": 0.14415005003230233, "rougeLsum_precision_stderr": 0.0018020269635249566, "rougeLsum_recall": 0.24228585475636266, "rougeLsum_recall_stderr": 0.0024919558768600326}}, "2": {"tldr_en": {"bleu": 2.1448539044725874, "bleu_stderr": 0.06277677171239805, "rouge1_fmeasure": 0.1970048673630101, "rouge1_fmeasure_stderr": 0.001873251264352212, "rouge1_precision": 0.1705870674804116, "rouge1_precision_stderr": 0.002048796485044356, "rouge1_recall": 0.28464437226221473, "rouge1_recall_stderr": 0.0026857267394890393, "rouge2_fmeasure": 0.04349341038860359, "rouge2_fmeasure_stderr": 0.000895910412734044, "rouge2_precision": 0.0375931104436251, "rouge2_precision_stderr": 0.0008379763934403864, "rouge2_recall": 0.0642856259647273, "rouge2_recall_stderr": 0.0014690192240657321, "rougeL_fmeasure": 0.1456514344157347, "rougeL_fmeasure_stderr": 0.0012777745388170482, "rougeL_precision": 0.12469459823199261, "rougeL_precision_stderr": 0.001380245091026989, "rougeL_recall": 0.21573877115274695, "rougeL_recall_stderr": 0.002116828674796055, "rougeLsum_fmeasure": 0.18373940262798896, "rougeLsum_fmeasure_stderr": 0.001747803293779092, "rougeLsum_precision": 0.15874261945387053, "rougeLsum_precision_stderr": 0.0018989313183848545, "rougeLsum_recall": 0.26633944675442933, "rougeLsum_recall_stderr": 0.0025440410206595122}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.23848794691656622, "bleu_stderr": 0.03586833580767261, "rouge1_fmeasure": 0.02476955743708725, "rouge1_fmeasure_stderr": 0.0007710556690163732, "rouge1_precision": 0.019286698361945742, "rouge1_precision_stderr": 0.0006077660476531964, "rouge1_recall": 0.03771027274112491, "rouge1_recall_stderr": 0.0012643509148260043, "rouge2_fmeasure": 0.0025241396159283234, "rouge2_fmeasure_stderr": 0.00028150795832003375, "rouge2_precision": 0.0018866790365766187, "rouge2_precision_stderr": 0.0002143326315912907, "rouge2_recall": 0.004268812953176167, "rouge2_recall_stderr": 0.00046738906149789073, "rougeL_fmeasure": 0.02408262386824008, "rougeL_fmeasure_stderr": 0.0007136995922629838, "rougeL_precision": 0.018711596296599558, "rougeL_precision_stderr": 0.000549656942001388, "rougeL_recall": 0.036704663603431754, "rougeL_recall_stderr": 0.0011874696198706125, "rougeLsum_fmeasure": 0.022774229967586195, "rougeLsum_fmeasure_stderr": 0.0006671086701287083, "rougeLsum_precision": 0.017739797167011156, "rougeLsum_precision_stderr": 0.0005177890835136993, "rougeLsum_recall": 0.03456546163273569, "rougeLsum_recall_stderr": 0.0011034522576739757}}, "1": {"generate_text_restaurant": {"bleu": 7.611947046679122, "bleu_stderr": 0.1143274370287527, "rouge1_fmeasure": 0.37951927025471477, "rouge1_fmeasure_stderr": 0.002437652542254183, "rouge1_precision": 0.39546550497235883, "rouge1_precision_stderr": 0.00338263349515896, "rouge1_recall": 0.41781588842973927, "rouge1_recall_stderr": 0.0028231707574303336, "rouge2_fmeasure": 0.162134939165695, "rouge2_fmeasure_stderr": 0.0018063315065409062, "rouge2_precision": 0.17176180426713916, "rouge2_precision_stderr": 0.002231883030057573, "rouge2_recall": 0.17748075850051998, "rouge2_recall_stderr": 0.0020235877252369762, "rougeL_fmeasure": 0.2827204639054135, "rougeL_fmeasure_stderr": 0.001840892993788695, "rougeL_precision": 0.29255451415473116, "rougeL_precision_stderr": 0.0025475028016928546, "rougeL_recall": 0.31556464887995433, "rougeL_recall_stderr": 0.0023200773160819157, "rougeLsum_fmeasure": 0.3085637675341545, "rougeLsum_fmeasure_stderr": 0.002263042771686147, "rougeLsum_precision": 0.3218545869658081, "rougeLsum_precision_stderr": 0.0030063312376711593, "rougeLsum_recall": 0.33940243915149054, "rougeLsum_recall_stderr": 0.002593279963684452}}, "2": {"generate_text_restaurant": {"bleu": 6.018041940580495, "bleu_stderr": 0.07197788580096429, "rouge1_fmeasure": 0.32521240877335217, "rouge1_fmeasure_stderr": 0.0019278160903662194, "rouge1_precision": 0.26958144890836977, "rouge1_precision_stderr": 0.0021540233079166827, "rouge1_recall": 0.4588728570611039, "rouge1_recall_stderr": 0.0027449063068318643, "rouge2_fmeasure": 0.13955166174444095, "rouge2_fmeasure_stderr": 0.0014269312475032364, "rouge2_precision": 0.11570444930629259, "rouge2_precision_stderr": 0.0014199073558424482, "rouge2_recall": 0.2004243711941686, "rouge2_recall_stderr": 0.0021042737791853398, "rougeL_fmeasure": 0.262564054866777, "rougeL_fmeasure_stderr": 0.0014381342603292985, "rougeL_precision": 0.2156185303251572, "rougeL_precision_stderr": 0.0015614536366490293, "rougeL_recall": 0.3759398467983639, "rougeL_recall_stderr": 0.002381221921078589, "rougeLsum_fmeasure": 0.2680607212245618, "rougeLsum_fmeasure_stderr": 0.0018300140862428237, "rougeLsum_precision": 0.22224277917141733, "rougeLsum_precision_stderr": 0.0019418664704165772, "rougeLsum_recall": 0.3788810896543758, "rougeLsum_recall_stderr": 0.0026592931094796103}}, "3": {"generate_text_restaurant": {"bleu": 6.247143410389527, "bleu_stderr": 0.09336005465042321, "rouge1_fmeasure": 0.32338937321619443, "rouge1_fmeasure_stderr": 0.0017934056667561753, "rouge1_precision": 0.25907184832717167, "rouge1_precision_stderr": 0.001803996044494107, "rouge1_recall": 0.4710593035922337, "rouge1_recall_stderr": 0.0027215741063189363, "rouge2_fmeasure": 0.14093386031313695, "rouge2_fmeasure_stderr": 0.0013638568945346288, "rouge2_precision": 0.11204434793169746, "rouge2_precision_stderr": 0.0012125669003284166, "rouge2_recall": 0.21007032469689518, "rouge2_recall_stderr": 0.0021257583048451844, "rougeL_fmeasure": 0.2638096769634126, "rougeL_fmeasure_stderr": 0.0013632619425518314, "rougeL_precision": 0.2101184628317088, "rougeL_precision_stderr": 0.001355833935975265, "rougeL_recall": 0.3886278435049077, "rougeL_recall_stderr": 0.0023407273478071285, "rougeLsum_fmeasure": 0.2689516848427162, "rougeLsum_fmeasure_stderr": 0.0017385224354802351, "rougeLsum_precision": 0.21561354041284292, "rougeLsum_precision_stderr": 0.0016830247054440658, "rougeLsum_recall": 0.3919032879874159, "rougeLsum_recall_stderr": 0.002632582004466588}}, "4": {"generate_text_restaurant": {"bleu": 6.208222380745828, "bleu_stderr": 0.09813424389228141, "rouge1_fmeasure": 0.32023650433333456, "rouge1_fmeasure_stderr": 0.0017784408554274436, "rouge1_precision": 0.25403488264023616, "rouge1_precision_stderr": 0.00174969774558582, "rouge1_recall": 0.4709476911476187, "rouge1_recall_stderr": 0.002655138227534261, "rouge2_fmeasure": 0.1403526845297047, "rouge2_fmeasure_stderr": 0.0013607857847417898, "rouge2_precision": 0.11058685749210603, "rouge2_precision_stderr": 0.0012080055063255909, "rouge2_recall": 0.21167766216290337, "rouge2_recall_stderr": 0.002144496564072756, "rougeL_fmeasure": 0.26122109578349895, "rougeL_fmeasure_stderr": 0.0013494634641885617, "rougeL_precision": 0.20576507091031662, "rougeL_precision_stderr": 0.0012824323500063948, "rougeL_recall": 0.3889751930979724, "rougeL_recall_stderr": 0.0023221115722203652, "rougeLsum_fmeasure": 0.2670143871011222, "rougeLsum_fmeasure_stderr": 0.0017625078468513325, "rougeLsum_precision": 0.21178966636346616, "rougeLsum_precision_stderr": 0.0016557432144235155, "rougeLsum_recall": 0.393450062046956, "rougeLsum_recall_stderr": 0.002663580316470773}}, "5": {"generate_text_restaurant": {"bleu": 6.148604351164331, "bleu_stderr": 0.11104852380350659, "rouge1_fmeasure": 0.3199925141836084, "rouge1_fmeasure_stderr": 0.0017551605827543929, "rouge1_precision": 0.2528456956077577, "rouge1_precision_stderr": 0.0017424978255548198, "rouge1_recall": 0.473017800721126, "rouge1_recall_stderr": 0.0026123356126015254, "rouge2_fmeasure": 0.13979285541625697, "rouge2_fmeasure_stderr": 0.0013089259606601385, "rouge2_precision": 0.10974089008299018, "rouge2_precision_stderr": 0.0011789832834562743, "rouge2_recall": 0.2119589387667116, "rouge2_recall_stderr": 0.0020903039724611107, "rougeL_fmeasure": 0.2608746206749859, "rougeL_fmeasure_stderr": 0.0013206910936061742, "rougeL_precision": 0.20463239752897472, "rougeL_precision_stderr": 0.0012743661964013022, "rougeL_recall": 0.3904493381375762, "rougeL_recall_stderr": 0.002278919766787325, "rougeLsum_fmeasure": 0.26667287137591805, "rougeLsum_fmeasure_stderr": 0.001737453548353309, "rougeLsum_precision": 0.21056870894114044, "rougeLsum_precision_stderr": 0.001637983619116459, "rougeLsum_recall": 0.39500142434961627, "rougeLsum_recall_stderr": 0.002625905199940031}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8439516808341079, "bleu_stderr": 0.09422907248075703, "rouge1_fmeasure": 0.20643072459177322, "rouge1_fmeasure_stderr": 0.0023952712703594957, "rouge1_precision": 0.1489888413499883, "rouge1_precision_stderr": 0.0018521966676135623, "rouge1_recall": 0.35598499223601365, "rouge1_recall_stderr": 0.004207211166343017, "rouge2_fmeasure": 0.04580671342088997, "rouge2_fmeasure_stderr": 0.0015030605732593261, "rouge2_precision": 0.0324623282960817, "rouge2_precision_stderr": 0.0010674592282863239, "rouge2_recall": 0.0820809502686687, "rouge2_recall_stderr": 0.002813424567852905, "rougeL_fmeasure": 0.15483993335255772, "rougeL_fmeasure_stderr": 0.0018017433912559451, "rougeL_precision": 0.11153813287033311, "rougeL_precision_stderr": 0.001374306634500298, "rougeL_recall": 0.26864922067574026, "rougeL_recall_stderr": 0.0033157904418926283, "rougeLsum_fmeasure": 0.16399947665135114, "rougeLsum_fmeasure_stderr": 0.002023909905331339, "rougeLsum_precision": 0.1179757836593057, "rougeLsum_precision_stderr": 0.0015154466049348198, "rougeLsum_recall": 0.2847650880272033, "rougeLsum_recall_stderr": 0.003712939966528145}}, "1": {"article_DOC_summary": {"bleu": 1.247854057178413, "bleu_stderr": 0.053793972634237734, "rouge1_fmeasure": 0.1655189451131417, "rouge1_fmeasure_stderr": 0.0023018941372401455, "rouge1_precision": 0.1177467243515202, "rouge1_precision_stderr": 0.0017455506749855451, "rouge1_recall": 0.29171927966859307, "rouge1_recall_stderr": 0.003919612515883716, "rouge2_fmeasure": 0.03052402695525932, "rouge2_fmeasure_stderr": 0.0012776222243206263, "rouge2_precision": 0.02141171153130294, "rouge2_precision_stderr": 0.0008955978971257212, "rouge2_recall": 0.05531529147706192, "rouge2_recall_stderr": 0.002372274102893343, "rougeL_fmeasure": 0.13085315336542225, "rougeL_fmeasure_stderr": 0.0017506812383595754, "rougeL_precision": 0.0927268161322543, "rougeL_precision_stderr": 0.0012892522930530307, "rougeL_recall": 0.232421966160188, "rougeL_recall_stderr": 0.0031136409947685123, "rougeLsum_fmeasure": 0.13257722034008568, "rougeLsum_fmeasure_stderr": 0.0018892826906714033, "rougeLsum_precision": 0.09395615061271292, "rougeLsum_precision_stderr": 0.0013824641506762928, "rougeLsum_recall": 0.23524314570007646, "rougeLsum_recall_stderr": 0.0033502788795521077}}, "2": {"article_DOC_summary": {"bleu": 1.1975588661664014, "bleu_stderr": 0.06263558549987819, "rouge1_fmeasure": 0.165998653508353, "rouge1_fmeasure_stderr": 0.002288522145343238, "rouge1_precision": 0.1177626659310641, "rouge1_precision_stderr": 0.001707946986197866, "rouge1_recall": 0.2928721346826048, "rouge1_recall_stderr": 0.0039006436966328977, "rouge2_fmeasure": 0.029102694813621793, "rouge2_fmeasure_stderr": 0.0012269354923261043, "rouge2_precision": 0.020528446471707117, "rouge2_precision_stderr": 0.000867911954165022, "rouge2_recall": 0.052169823470780564, "rouge2_recall_stderr": 0.0022808799038575088, "rougeL_fmeasure": 0.12917404609126437, "rougeL_fmeasure_stderr": 0.0016814421892826735, "rougeL_precision": 0.09145048851843915, "rougeL_precision_stderr": 0.0012506279990524979, "rougeL_recall": 0.22942628081026484, "rougeL_recall_stderr": 0.002973056999961359, "rougeLsum_fmeasure": 0.13296050139540747, "rougeLsum_fmeasure_stderr": 0.001845998235299578, "rougeLsum_precision": 0.0940902852266538, "rougeLsum_precision_stderr": 0.0013601724232084945, "rougeLsum_recall": 0.23630864651546127, "rougeLsum_recall_stderr": 0.003290166668177434}}, "3": {"article_DOC_summary": {"bleu": 1.1532375468354903, "bleu_stderr": 0.09169402373692534, "rouge1_fmeasure": 0.15647200121804894, "rouge1_fmeasure_stderr": 0.0023727238595388817, "rouge1_precision": 0.1131657921685037, "rouge1_precision_stderr": 0.0018249204798791675, "rouge1_recall": 0.2722598710120147, "rouge1_recall_stderr": 0.004133665114298229, "rouge2_fmeasure": 0.027201929614022874, "rouge2_fmeasure_stderr": 0.0012539141722579825, "rouge2_precision": 0.019363405482896792, "rouge2_precision_stderr": 0.0009105840932812386, "rouge2_recall": 0.049210613825466105, "rouge2_recall_stderr": 0.002366221262626497, "rougeL_fmeasure": 0.12268909574632023, "rougeL_fmeasure_stderr": 0.001773225086338, "rougeL_precision": 0.08876222177060916, "rougeL_precision_stderr": 0.0013761155158951413, "rougeL_recall": 0.21448631404181784, "rougeL_recall_stderr": 0.003198193852945419, "rougeLsum_fmeasure": 0.12593169516392616, "rougeLsum_fmeasure_stderr": 0.001962773566689985, "rougeLsum_precision": 0.09102722612426341, "rougeLsum_precision_stderr": 0.0014951062687195294, "rougeLsum_recall": 0.22021791608209987, "rougeLsum_recall_stderr": 0.003544530658562865}}, "4": {"article_DOC_summary": {"bleu": 0.6055404023350197, "bleu_stderr": 0.09409517819503949, "rouge1_fmeasure": 0.04426046707797172, "rouge1_fmeasure_stderr": 0.0024997989992097624, "rouge1_precision": 0.0383955897314317, "rouge1_precision_stderr": 0.0025422863710305266, "rouge1_recall": 0.06915759782304308, "rouge1_recall_stderr": 0.0039608171509673586, "rouge2_fmeasure": 0.008141029291990268, "rouge2_fmeasure_stderr": 0.0008202019731008721, "rouge2_precision": 0.006481358010154669, "rouge2_precision_stderr": 0.0007393187434393857, "rouge2_recall": 0.013044537342091751, "rouge2_recall_stderr": 0.00125152402986618, "rougeL_fmeasure": 0.035265847972026074, "rougeL_fmeasure_stderr": 0.0020006364168593496, "rougeL_precision": 0.031113719615522017, "rougeL_precision_stderr": 0.0021898843752975964, "rougeL_recall": 0.055197273257753664, "rougeL_recall_stderr": 0.003191098630834803, "rougeLsum_fmeasure": 0.03632930315349291, "rougeLsum_fmeasure_stderr": 0.0020756841173550696, "rougeLsum_precision": 0.03210804984465074, "rougeLsum_precision_stderr": 0.0022623416009452243, "rougeLsum_recall": 0.05680048575290953, "rougeLsum_recall_stderr": 0.0032931638988068055}}, "5": {"article_DOC_summary": {"bleu": 1.1094899638874875e-37, "bleu_stderr": 1.0662006738968273e-31, "rouge1_fmeasure": 0.0024831420596845987, "rouge1_fmeasure_stderr": 0.0007070331562075694, "rouge1_precision": 0.002821793157093116, "rouge1_precision_stderr": 0.0008106295472902334, "rouge1_recall": 0.0022851464113943067, "rouge1_recall_stderr": 0.0006484962567584845, "rouge2_fmeasure": 0.0002519246692387647, "rouge2_fmeasure_stderr": 0.00017508831809454146, "rouge2_precision": 0.0002914830883754302, "rouge2_precision_stderr": 0.00020340968454234058, "rouge2_recall": 0.00022401756585996315, "rouge2_recall_stderr": 0.0001545711517822221, "rougeL_fmeasure": 0.001910475794160408, "rougeL_fmeasure_stderr": 0.0005775932958294451, "rougeL_precision": 0.0021547453202339587, "rougeL_precision_stderr": 0.0006563220446223475, "rougeL_recall": 0.0017708458456009723, "rougeL_recall_stderr": 0.0005325445996971974, "rougeLsum_fmeasure": 0.0020447516574242644, "rougeLsum_fmeasure_stderr": 0.000612874204929875, "rougeLsum_precision": 0.002307213397230338, "rougeLsum_precision_stderr": 0.0006934611042748366, "rougeLsum_recall": 0.001892317665185999, "rougeLsum_recall_stderr": 0.0005664327179419651}}}}
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4066206711982134, "bleu_stderr": 0.033723013965742, "rouge1_fmeasure": 0.10895883012469396, "rouge1_fmeasure_stderr": 0.002008528704603682, "rouge1_precision": 0.07192732542952907, "rouge1_precision_stderr": 0.0016382475971692088, "rouge1_recall": 0.30547187741815907, "rouge1_recall_stderr": 0.004666013432977962, "rouge2_fmeasure": 0.05142765596627262, "rouge2_fmeasure_stderr": 0.001246824543408159, "rouge2_precision": 0.034124224521766514, "rouge2_precision_stderr": 0.0011258346949061335, "rouge2_recall": 0.14893550564787877, "rouge2_recall_stderr": 0.0032632079712954696, "rougeL_fmeasure": 0.10554163962469583, "rougeL_fmeasure_stderr": 0.0018999401584976951, "rougeL_precision": 0.06951188299150508, "rougeL_precision_stderr": 0.0015536321007724944, "rougeL_recall": 0.29797998285094185, "rougeL_recall_stderr": 0.004578539464668976, "rougeLsum_fmeasure": 0.10415553154282955, "rougeLsum_fmeasure_stderr": 0.0018954370177080594, "rougeLsum_precision": 0.06880186502940373, "rougeLsum_precision_stderr": 0.0015660770018247742, "rougeLsum_recall": 0.2918595743249444, "rougeLsum_recall_stderr": 0.004365973340593178}}, "1": {"PALM_prompt": {"bleu": 0.441015227234624, "bleu_stderr": 0.02811928663369506, "rouge1_fmeasure": 0.11213595098636865, "rouge1_fmeasure_stderr": 0.001929589035018206, "rouge1_precision": 0.0726548726317011, "rouge1_precision_stderr": 0.0015123203660414558, "rouge1_recall": 0.3480211924440434, "rouge1_recall_stderr": 0.005007162611700053, "rouge2_fmeasure": 0.051755190143320286, "rouge2_fmeasure_stderr": 0.0012231844221965455, "rouge2_precision": 0.0335432322481532, "rouge2_precision_stderr": 0.0009695182541133993, "rouge2_recall": 0.16639593533541566, "rouge2_recall_stderr": 0.0034320944741141886, "rougeL_fmeasure": 0.10765664842911221, "rougeL_fmeasure_stderr": 0.001806223035008682, "rougeL_precision": 0.06967949917271966, "rougeL_precision_stderr": 0.0014170582927231956, "rougeL_recall": 0.3335173366403848, "rougeL_recall_stderr": 0.004760042112091419, "rougeLsum_fmeasure": 0.10716257387185534, "rougeLsum_fmeasure_stderr": 0.0018203621728808315, "rougeLsum_precision": 0.06949682130009485, "rougeLsum_precision_stderr": 0.00144169867509152, "rougeLsum_recall": 0.33110876740142015, "rougeLsum_recall_stderr": 0.004635864354540981}}, "2": {"PALM_prompt": {"bleu": 0.4879530680626219, "bleu_stderr": 0.022226561756050065, "rouge1_fmeasure": 0.11576991782120694, "rouge1_fmeasure_stderr": 0.0017767478418792807, "rouge1_precision": 0.07450171294409999, "rouge1_precision_stderr": 0.0014273076118505135, "rouge1_recall": 0.3721602744426868, "rouge1_recall_stderr": 0.005007331479391403, "rouge2_fmeasure": 0.05337272852022536, "rouge2_fmeasure_stderr": 0.0011355711345846292, "rouge2_precision": 0.0339057544893547, "rouge2_precision_stderr": 0.0008164561648565925, "rouge2_recall": 0.18184437840047113, "rouge2_recall_stderr": 0.0035778407669343807, "rougeL_fmeasure": 0.10917124493922556, "rougeL_fmeasure_stderr": 0.0016360379225532403, "rougeL_precision": 0.07019218668467868, "rougeL_precision_stderr": 0.0013187112542434566, "rougeL_recall": 0.35049899587977895, "rougeL_recall_stderr": 0.004645247539219549, "rougeLsum_fmeasure": 0.11027371768557967, "rougeLsum_fmeasure_stderr": 0.001672349736688818, "rougeLsum_precision": 0.0710033634035766, "rougeLsum_precision_stderr": 0.0013509496399884485, "rougeLsum_recall": 0.35308766126574853, "rougeLsum_recall_stderr": 0.0046035050696858785}}, "3": {"PALM_prompt": {"bleu": 0.5301668420224878, "bleu_stderr": 0.03569428708079646, "rouge1_fmeasure": 0.1171472708002559, "rouge1_fmeasure_stderr": 0.0017816548360566188, "rouge1_precision": 0.07488836003765288, "rouge1_precision_stderr": 0.0013554555261273335, "rouge1_recall": 0.3807943128379292, "rouge1_recall_stderr": 0.005068093354553868, "rouge2_fmeasure": 0.05377700106212916, "rouge2_fmeasure_stderr": 0.0011360301606876445, "rouge2_precision": 0.03427049927749005, "rouge2_precision_stderr": 0.0008265258547875084, "rouge2_recall": 0.18483650746981256, "rouge2_recall_stderr": 0.003608882407080203, "rougeL_fmeasure": 0.10993076451110965, "rougeL_fmeasure_stderr": 0.0016489346520097603, "rougeL_precision": 0.07027899744037729, "rougeL_precision_stderr": 0.001245277127335206, "rougeL_recall": 0.3555258422023413, "rougeL_recall_stderr": 0.0046123886903595935, "rougeLsum_fmeasure": 0.1111418505334578, "rougeLsum_fmeasure_stderr": 0.0016828926127784973, "rougeLsum_precision": 0.07110543618912658, "rougeLsum_precision_stderr": 0.0012791823741695914, "rougeLsum_recall": 0.3590687947063441, "rougeLsum_recall_stderr": 0.00463365260981097}}, "4": {"PALM_prompt": {"bleu": 0.5348156799398361, "bleu_stderr": 0.031415178247695734, "rouge1_fmeasure": 0.1164756890672599, "rouge1_fmeasure_stderr": 0.001758429091091271, "rouge1_precision": 0.07390461269243237, "rouge1_precision_stderr": 0.001293996633238653, "rouge1_recall": 0.38041071103283647, "rouge1_recall_stderr": 0.005023925872353074, "rouge2_fmeasure": 0.053735655975499355, "rouge2_fmeasure_stderr": 0.0011371487191998832, "rouge2_precision": 0.03395191423816771, "rouge2_precision_stderr": 0.0008189663038696395, "rouge2_recall": 0.1866135607625151, "rouge2_recall_stderr": 0.0035601351343611675, "rougeL_fmeasure": 0.10880311687790475, "rougeL_fmeasure_stderr": 0.0016083844853606128, "rougeL_precision": 0.06908460337868647, "rougeL_precision_stderr": 0.0011940464395063983, "rougeL_recall": 0.35473943451259965, "rougeL_recall_stderr": 0.004532226704166, "rougeLsum_fmeasure": 0.11092328027483066, "rougeLsum_fmeasure_stderr": 0.001672147704716892, "rougeLsum_precision": 0.0704742694632504, "rougeLsum_precision_stderr": 0.0012395099584306879, "rougeLsum_recall": 0.3605369289230848, "rougeLsum_recall_stderr": 0.004596776569640385}}, "5": {"PALM_prompt": {"bleu": 0.5732238081850473, "bleu_stderr": 0.03036082938574753, "rouge1_fmeasure": 0.11651929897196142, "rouge1_fmeasure_stderr": 0.0017387855983505528, "rouge1_precision": 0.07377845593101123, "rouge1_precision_stderr": 0.0012899306320703606, "rouge1_recall": 0.38865641410241025, "rouge1_recall_stderr": 0.005154465126771031, "rouge2_fmeasure": 0.05404322067951011, "rouge2_fmeasure_stderr": 0.0011183334922281324, "rouge2_precision": 0.03410978907679326, "rouge2_precision_stderr": 0.0008056298739679156, "rouge2_recall": 0.1920087524119153, "rouge2_recall_stderr": 0.0037563355013072487, "rougeL_fmeasure": 0.1075376368299557, "rougeL_fmeasure_stderr": 0.0015771981017094758, "rougeL_precision": 0.06813048579497075, "rougeL_precision_stderr": 0.0011691987048583971, "rougeL_recall": 0.35794433381414287, "rougeL_recall_stderr": 0.004634098332478846, "rougeLsum_fmeasure": 0.11046544450032314, "rougeLsum_fmeasure_stderr": 0.0016439966953760371, "rougeLsum_precision": 0.06995867144678722, "rougeLsum_precision_stderr": 0.001211431300670937, "rougeLsum_recall": 0.36749023175344364, "rougeLsum_recall_stderr": 0.00476284510936873}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5382561696068602, "bleu_stderr": 0.06399544328700199, "rouge1_fmeasure": 0.17482546253918968, "rouge1_fmeasure_stderr": 0.0017962093241495317, "rouge1_precision": 0.1493431846788089, "rouge1_precision_stderr": 0.0018651057951166279, "rouge1_recall": 0.2548583336571623, "rouge1_recall_stderr": 0.002528299389518057, "rouge2_fmeasure": 0.03487145839395421, "rouge2_fmeasure_stderr": 0.0008257165713416542, "rouge2_precision": 0.02958059388980271, "rouge2_precision_stderr": 0.0007357126571931123, "rouge2_recall": 0.0521633146578729, "rouge2_recall_stderr": 0.0013585554854051705, "rougeL_fmeasure": 0.13645058466848356, "rougeL_fmeasure_stderr": 0.0012505081899530457, "rougeL_precision": 0.11482322910631781, "rougeL_precision_stderr": 0.0012610070217646853, "rougeL_recall": 0.2047896774759962, "rougeL_recall_stderr": 0.0020475160072816, "rougeLsum_fmeasure": 0.16137882388270616, "rougeLsum_fmeasure_stderr": 0.0016412506052086466, "rougeLsum_precision": 0.13765195894138185, "rougeLsum_precision_stderr": 0.0017075576824449126, "rougeLsum_recall": 0.23641320862844858, "rougeLsum_recall_stderr": 0.0023610992069355388}}, "1": {"tldr_en": {"bleu": 1.7101242197452051, "bleu_stderr": 0.04333761308856783, "rouge1_fmeasure": 0.1796806784514, "rouge1_fmeasure_stderr": 0.0018714401391125607, "rouge1_precision": 0.1550521213351438, "rouge1_precision_stderr": 0.0019541763971361132, "rouge1_recall": 0.2591201747750158, "rouge1_recall_stderr": 0.0026478655932746374, "rouge2_fmeasure": 0.03614437414445121, "rouge2_fmeasure_stderr": 0.0008460043362571757, "rouge2_precision": 0.030990812889466937, "rouge2_precision_stderr": 0.0007609017578133211, "rouge2_recall": 0.05362608829041054, "rouge2_recall_stderr": 0.001393078096579126, "rougeL_fmeasure": 0.13484320380645512, "rougeL_fmeasure_stderr": 0.001257828140076275, "rougeL_precision": 0.11485232716958933, "rougeL_precision_stderr": 0.0012878987216988057, "rougeL_recall": 0.19958644094767267, "rougeL_recall_stderr": 0.002041519133152489, "rougeLsum_fmeasure": 0.16735135112097876, "rougeLsum_fmeasure_stderr": 0.0017317067736812232, "rougeLsum_precision": 0.14415005003230233, "rougeLsum_precision_stderr": 0.0018020269635249566, "rougeLsum_recall": 0.24228585475636266, "rougeLsum_recall_stderr": 0.0024919558768600326}}, "2": {"tldr_en": {"bleu": 2.1448539044725874, "bleu_stderr": 0.06277677171239805, "rouge1_fmeasure": 0.1970048673630101, "rouge1_fmeasure_stderr": 0.001873251264352212, "rouge1_precision": 0.1705870674804116, "rouge1_precision_stderr": 0.002048796485044356, "rouge1_recall": 0.28464437226221473, "rouge1_recall_stderr": 0.0026857267394890393, "rouge2_fmeasure": 0.04349341038860359, "rouge2_fmeasure_stderr": 0.000895910412734044, "rouge2_precision": 0.0375931104436251, "rouge2_precision_stderr": 0.0008379763934403864, "rouge2_recall": 0.0642856259647273, "rouge2_recall_stderr": 0.0014690192240657321, "rougeL_fmeasure": 0.1456514344157347, "rougeL_fmeasure_stderr": 0.0012777745388170482, "rougeL_precision": 0.12469459823199261, "rougeL_precision_stderr": 0.001380245091026989, "rougeL_recall": 0.21573877115274695, "rougeL_recall_stderr": 0.002116828674796055, "rougeLsum_fmeasure": 0.18373940262798896, "rougeLsum_fmeasure_stderr": 0.001747803293779092, "rougeLsum_precision": 0.15874261945387053, "rougeLsum_precision_stderr": 0.0018989313183848545, "rougeLsum_recall": 0.26633944675442933, "rougeLsum_recall_stderr": 0.0025440410206595122}}, "3": {"tldr_en": {"bleu": 2.6219935635481972, "bleu_stderr": 0.08393574440404722, "rouge1_fmeasure": 0.1743984403763459, "rouge1_fmeasure_stderr": 0.0022511670759053945, "rouge1_precision": 0.15716025539089434, "rouge1_precision_stderr": 0.0024592378552505028, "rouge1_recall": 0.2504040133909296, "rouge1_recall_stderr": 0.00322686952791858, "rouge2_fmeasure": 0.04153907298260113, "rouge2_fmeasure_stderr": 0.0009828671477776307, "rouge2_precision": 0.03770029321333559, "rouge2_precision_stderr": 0.0010493856839055736, "rouge2_recall": 0.06139263756637512, "rouge2_recall_stderr": 0.0015683949110632596, "rougeL_fmeasure": 0.12786858504764273, "rougeL_fmeasure_stderr": 0.0015702622825614777, "rougeL_precision": 0.1148399310970495, "rougeL_precision_stderr": 0.001785457930450565, "rougeL_recall": 0.18822312963677912, "rougeL_recall_stderr": 0.0025054897190357655, "rougeLsum_fmeasure": 0.16332461870887288, "rougeLsum_fmeasure_stderr": 0.00211104112963892, "rougeLsum_precision": 0.1472078800267912, "rougeLsum_precision_stderr": 0.0023221753474528643, "rougeLsum_recall": 0.23528203295989167, "rougeLsum_recall_stderr": 0.0030629309349833227}}, "4": {"tldr_en": {"bleu": 0.5649744001319206, "bleu_stderr": 0.030824473521345572, "rouge1_fmeasure": 0.057054489476321614, "rouge1_fmeasure_stderr": 0.0019358185950117408, "rouge1_precision": 0.05365912856291298, "rouge1_precision_stderr": 0.0020564569523574905, "rouge1_recall": 0.08521079397947685, "rouge1_recall_stderr": 0.0029171347337391424, "rouge2_fmeasure": 0.013711405505692513, "rouge2_fmeasure_stderr": 0.0006800023215766142, "rouge2_precision": 0.01308553916410283, "rouge2_precision_stderr": 0.0008304167357866675, "rouge2_recall": 0.020935634903111518, "rouge2_recall_stderr": 0.0011075958722668695, "rougeL_fmeasure": 0.043361310268114094, "rougeL_fmeasure_stderr": 0.001449080298946082, "rougeL_precision": 0.04107368844828485, "rougeL_precision_stderr": 0.0016203002005707164, "rougeL_recall": 0.06607615650560776, "rougeL_recall_stderr": 0.0022899509349880477, "rougeLsum_fmeasure": 0.053462722576117123, "rougeLsum_fmeasure_stderr": 0.0018157055884233443, "rougeLsum_precision": 0.050471233695121936, "rougeLsum_precision_stderr": 0.00195623667484853, "rougeLsum_recall": 0.07983111967154978, "rougeLsum_recall_stderr": 0.002740536059601005}}, "5": {"tldr_en": {"bleu": 7.052457708568259e-07, "bleu_stderr": 1.5580506381291971e-06, "rouge1_fmeasure": 0.009197238603875856, "rouge1_fmeasure_stderr": 0.0008566025651298484, "rouge1_precision": 0.009021675137974104, "rouge1_precision_stderr": 0.0009273930612309317, "rouge1_recall": 0.013663969355703246, "rouge1_recall_stderr": 0.0012759958727220507, "rouge2_fmeasure": 0.002246221879718567, "rouge2_fmeasure_stderr": 0.00027392493872686923, "rouge2_precision": 0.0022865203948137253, "rouge2_precision_stderr": 0.00031974081575163405, "rouge2_recall": 0.00333269885388563, "rouge2_recall_stderr": 0.0004364822352904779, "rougeL_fmeasure": 0.0070346110705439505, "rougeL_fmeasure_stderr": 0.0006447622583037963, "rougeL_precision": 0.00692394473380661, "rougeL_precision_stderr": 0.0007047267826868438, "rougeL_recall": 0.010765427828768026, "rougeL_recall_stderr": 0.0010267586038697376, "rougeLsum_fmeasure": 0.008670846152769262, "rougeLsum_fmeasure_stderr": 0.000805897930393055, "rougeLsum_precision": 0.008571242687553706, "rougeLsum_precision_stderr": 0.0008889426200571478, "rougeLsum_recall": 0.012905788312130623, "rougeLsum_recall_stderr": 0.001206811784034381}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.23848794691656622, "bleu_stderr": 0.03586833580767261, "rouge1_fmeasure": 0.02476955743708725, "rouge1_fmeasure_stderr": 0.0007710556690163732, "rouge1_precision": 0.019286698361945742, "rouge1_precision_stderr": 0.0006077660476531964, "rouge1_recall": 0.03771027274112491, "rouge1_recall_stderr": 0.0012643509148260043, "rouge2_fmeasure": 0.0025241396159283234, "rouge2_fmeasure_stderr": 0.00028150795832003375, "rouge2_precision": 0.0018866790365766187, "rouge2_precision_stderr": 0.0002143326315912907, "rouge2_recall": 0.004268812953176167, "rouge2_recall_stderr": 0.00046738906149789073, "rougeL_fmeasure": 0.02408262386824008, "rougeL_fmeasure_stderr": 0.0007136995922629838, "rougeL_precision": 0.018711596296599558, "rougeL_precision_stderr": 0.000549656942001388, "rougeL_recall": 0.036704663603431754, "rougeL_recall_stderr": 0.0011874696198706125, "rougeLsum_fmeasure": 0.022774229967586195, "rougeLsum_fmeasure_stderr": 0.0006671086701287083, "rougeLsum_precision": 0.017739797167011156, "rougeLsum_precision_stderr": 0.0005177890835136993, "rougeLsum_recall": 0.03456546163273569, "rougeLsum_recall_stderr": 0.0011034522576739757}}, "1": {"generate_text_restaurant": {"bleu": 7.611947046679122, "bleu_stderr": 0.1143274370287527, "rouge1_fmeasure": 0.37951927025471477, "rouge1_fmeasure_stderr": 0.002437652542254183, "rouge1_precision": 0.39546550497235883, "rouge1_precision_stderr": 0.00338263349515896, "rouge1_recall": 0.41781588842973927, "rouge1_recall_stderr": 0.0028231707574303336, "rouge2_fmeasure": 0.162134939165695, "rouge2_fmeasure_stderr": 0.0018063315065409062, "rouge2_precision": 0.17176180426713916, "rouge2_precision_stderr": 0.002231883030057573, "rouge2_recall": 0.17748075850051998, "rouge2_recall_stderr": 0.0020235877252369762, "rougeL_fmeasure": 0.2827204639054135, "rougeL_fmeasure_stderr": 0.001840892993788695, "rougeL_precision": 0.29255451415473116, "rougeL_precision_stderr": 0.0025475028016928546, "rougeL_recall": 0.31556464887995433, "rougeL_recall_stderr": 0.0023200773160819157, "rougeLsum_fmeasure": 0.3085637675341545, "rougeLsum_fmeasure_stderr": 0.002263042771686147, "rougeLsum_precision": 0.3218545869658081, "rougeLsum_precision_stderr": 0.0030063312376711593, "rougeLsum_recall": 0.33940243915149054, "rougeLsum_recall_stderr": 0.002593279963684452}}, "2": {"generate_text_restaurant": {"bleu": 6.018041940580495, "bleu_stderr": 0.07197788580096429, "rouge1_fmeasure": 0.32521240877335217, "rouge1_fmeasure_stderr": 0.0019278160903662194, "rouge1_precision": 0.26958144890836977, "rouge1_precision_stderr": 0.0021540233079166827, "rouge1_recall": 0.4588728570611039, "rouge1_recall_stderr": 0.0027449063068318643, "rouge2_fmeasure": 0.13955166174444095, "rouge2_fmeasure_stderr": 0.0014269312475032364, "rouge2_precision": 0.11570444930629259, "rouge2_precision_stderr": 0.0014199073558424482, "rouge2_recall": 0.2004243711941686, "rouge2_recall_stderr": 0.0021042737791853398, "rougeL_fmeasure": 0.262564054866777, "rougeL_fmeasure_stderr": 0.0014381342603292985, "rougeL_precision": 0.2156185303251572, "rougeL_precision_stderr": 0.0015614536366490293, "rougeL_recall": 0.3759398467983639, "rougeL_recall_stderr": 0.002381221921078589, "rougeLsum_fmeasure": 0.2680607212245618, "rougeLsum_fmeasure_stderr": 0.0018300140862428237, "rougeLsum_precision": 0.22224277917141733, "rougeLsum_precision_stderr": 0.0019418664704165772, "rougeLsum_recall": 0.3788810896543758, "rougeLsum_recall_stderr": 0.0026592931094796103}}, "3": {"generate_text_restaurant": {"bleu": 6.247143410389527, "bleu_stderr": 0.09336005465042321, "rouge1_fmeasure": 0.32338937321619443, "rouge1_fmeasure_stderr": 0.0017934056667561753, "rouge1_precision": 0.25907184832717167, "rouge1_precision_stderr": 0.001803996044494107, "rouge1_recall": 0.4710593035922337, "rouge1_recall_stderr": 0.0027215741063189363, "rouge2_fmeasure": 0.14093386031313695, "rouge2_fmeasure_stderr": 0.0013638568945346288, "rouge2_precision": 0.11204434793169746, "rouge2_precision_stderr": 0.0012125669003284166, "rouge2_recall": 0.21007032469689518, "rouge2_recall_stderr": 0.0021257583048451844, "rougeL_fmeasure": 0.2638096769634126, "rougeL_fmeasure_stderr": 0.0013632619425518314, "rougeL_precision": 0.2101184628317088, "rougeL_precision_stderr": 0.001355833935975265, "rougeL_recall": 0.3886278435049077, "rougeL_recall_stderr": 0.0023407273478071285, "rougeLsum_fmeasure": 0.2689516848427162, "rougeLsum_fmeasure_stderr": 0.0017385224354802351, "rougeLsum_precision": 0.21561354041284292, "rougeLsum_precision_stderr": 0.0016830247054440658, "rougeLsum_recall": 0.3919032879874159, "rougeLsum_recall_stderr": 0.002632582004466588}}, "4": {"generate_text_restaurant": {"bleu": 6.208222380745828, "bleu_stderr": 0.09813424389228141, "rouge1_fmeasure": 0.32023650433333456, "rouge1_fmeasure_stderr": 0.0017784408554274436, "rouge1_precision": 0.25403488264023616, "rouge1_precision_stderr": 0.00174969774558582, "rouge1_recall": 0.4709476911476187, "rouge1_recall_stderr": 0.002655138227534261, "rouge2_fmeasure": 0.1403526845297047, "rouge2_fmeasure_stderr": 0.0013607857847417898, "rouge2_precision": 0.11058685749210603, "rouge2_precision_stderr": 0.0012080055063255909, "rouge2_recall": 0.21167766216290337, "rouge2_recall_stderr": 0.002144496564072756, "rougeL_fmeasure": 0.26122109578349895, "rougeL_fmeasure_stderr": 0.0013494634641885617, "rougeL_precision": 0.20576507091031662, "rougeL_precision_stderr": 0.0012824323500063948, "rougeL_recall": 0.3889751930979724, "rougeL_recall_stderr": 0.0023221115722203652, "rougeLsum_fmeasure": 0.2670143871011222, "rougeLsum_fmeasure_stderr": 0.0017625078468513325, "rougeLsum_precision": 0.21178966636346616, "rougeLsum_precision_stderr": 0.0016557432144235155, "rougeLsum_recall": 0.393450062046956, "rougeLsum_recall_stderr": 0.002663580316470773}}, "5": {"generate_text_restaurant": {"bleu": 6.148604351164331, "bleu_stderr": 0.11104852380350659, "rouge1_fmeasure": 0.3199925141836084, "rouge1_fmeasure_stderr": 0.0017551605827543929, "rouge1_precision": 0.2528456956077577, "rouge1_precision_stderr": 0.0017424978255548198, "rouge1_recall": 0.473017800721126, "rouge1_recall_stderr": 0.0026123356126015254, "rouge2_fmeasure": 0.13979285541625697, "rouge2_fmeasure_stderr": 0.0013089259606601385, "rouge2_precision": 0.10974089008299018, "rouge2_precision_stderr": 0.0011789832834562743, "rouge2_recall": 0.2119589387667116, "rouge2_recall_stderr": 0.0020903039724611107, "rougeL_fmeasure": 0.2608746206749859, "rougeL_fmeasure_stderr": 0.0013206910936061742, "rougeL_precision": 0.20463239752897472, "rougeL_precision_stderr": 0.0012743661964013022, "rougeL_recall": 0.3904493381375762, "rougeL_recall_stderr": 0.002278919766787325, "rougeLsum_fmeasure": 0.26667287137591805, "rougeLsum_fmeasure_stderr": 0.001737453548353309, "rougeLsum_precision": 0.21056870894114044, "rougeLsum_precision_stderr": 0.001637983619116459, "rougeLsum_recall": 0.39500142434961627, "rougeLsum_recall_stderr": 0.002625905199940031}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8439516808341079, "bleu_stderr": 0.09422907248075703, "rouge1_fmeasure": 0.20643072459177322, "rouge1_fmeasure_stderr": 0.0023952712703594957, "rouge1_precision": 0.1489888413499883, "rouge1_precision_stderr": 0.0018521966676135623, "rouge1_recall": 0.35598499223601365, "rouge1_recall_stderr": 0.004207211166343017, "rouge2_fmeasure": 0.04580671342088997, "rouge2_fmeasure_stderr": 0.0015030605732593261, "rouge2_precision": 0.0324623282960817, "rouge2_precision_stderr": 0.0010674592282863239, "rouge2_recall": 0.0820809502686687, "rouge2_recall_stderr": 0.002813424567852905, "rougeL_fmeasure": 0.15483993335255772, "rougeL_fmeasure_stderr": 0.0018017433912559451, "rougeL_precision": 0.11153813287033311, "rougeL_precision_stderr": 0.001374306634500298, "rougeL_recall": 0.26864922067574026, "rougeL_recall_stderr": 0.0033157904418926283, "rougeLsum_fmeasure": 0.16399947665135114, "rougeLsum_fmeasure_stderr": 0.002023909905331339, "rougeLsum_precision": 0.1179757836593057, "rougeLsum_precision_stderr": 0.0015154466049348198, "rougeLsum_recall": 0.2847650880272033, "rougeLsum_recall_stderr": 0.003712939966528145}}, "1": {"article_DOC_summary": {"bleu": 1.247854057178413, "bleu_stderr": 0.053793972634237734, "rouge1_fmeasure": 0.1655189451131417, "rouge1_fmeasure_stderr": 0.0023018941372401455, "rouge1_precision": 0.1177467243515202, "rouge1_precision_stderr": 0.0017455506749855451, "rouge1_recall": 0.29171927966859307, "rouge1_recall_stderr": 0.003919612515883716, "rouge2_fmeasure": 0.03052402695525932, "rouge2_fmeasure_stderr": 0.0012776222243206263, "rouge2_precision": 0.02141171153130294, "rouge2_precision_stderr": 0.0008955978971257212, "rouge2_recall": 0.05531529147706192, "rouge2_recall_stderr": 0.002372274102893343, "rougeL_fmeasure": 0.13085315336542225, "rougeL_fmeasure_stderr": 0.0017506812383595754, "rougeL_precision": 0.0927268161322543, "rougeL_precision_stderr": 0.0012892522930530307, "rougeL_recall": 0.232421966160188, "rougeL_recall_stderr": 0.0031136409947685123, "rougeLsum_fmeasure": 0.13257722034008568, "rougeLsum_fmeasure_stderr": 0.0018892826906714033, "rougeLsum_precision": 0.09395615061271292, "rougeLsum_precision_stderr": 0.0013824641506762928, "rougeLsum_recall": 0.23524314570007646, "rougeLsum_recall_stderr": 0.0033502788795521077}}, "2": {"article_DOC_summary": {"bleu": 1.1975588661664014, "bleu_stderr": 0.06263558549987819, "rouge1_fmeasure": 0.165998653508353, "rouge1_fmeasure_stderr": 0.002288522145343238, "rouge1_precision": 0.1177626659310641, "rouge1_precision_stderr": 0.001707946986197866, "rouge1_recall": 0.2928721346826048, "rouge1_recall_stderr": 0.0039006436966328977, "rouge2_fmeasure": 0.029102694813621793, "rouge2_fmeasure_stderr": 0.0012269354923261043, "rouge2_precision": 0.020528446471707117, "rouge2_precision_stderr": 0.000867911954165022, "rouge2_recall": 0.052169823470780564, "rouge2_recall_stderr": 0.0022808799038575088, "rougeL_fmeasure": 0.12917404609126437, "rougeL_fmeasure_stderr": 0.0016814421892826735, "rougeL_precision": 0.09145048851843915, "rougeL_precision_stderr": 0.0012506279990524979, "rougeL_recall": 0.22942628081026484, "rougeL_recall_stderr": 0.002973056999961359, "rougeLsum_fmeasure": 0.13296050139540747, "rougeLsum_fmeasure_stderr": 0.001845998235299578, "rougeLsum_precision": 0.0940902852266538, "rougeLsum_precision_stderr": 0.0013601724232084945, "rougeLsum_recall": 0.23630864651546127, "rougeLsum_recall_stderr": 0.003290166668177434}}, "3": {"article_DOC_summary": {"bleu": 1.1532375468354903, "bleu_stderr": 0.09169402373692534, "rouge1_fmeasure": 0.15647200121804894, "rouge1_fmeasure_stderr": 0.0023727238595388817, "rouge1_precision": 0.1131657921685037, "rouge1_precision_stderr": 0.0018249204798791675, "rouge1_recall": 0.2722598710120147, "rouge1_recall_stderr": 0.004133665114298229, "rouge2_fmeasure": 0.027201929614022874, "rouge2_fmeasure_stderr": 0.0012539141722579825, "rouge2_precision": 0.019363405482896792, "rouge2_precision_stderr": 0.0009105840932812386, "rouge2_recall": 0.049210613825466105, "rouge2_recall_stderr": 0.002366221262626497, "rougeL_fmeasure": 0.12268909574632023, "rougeL_fmeasure_stderr": 0.001773225086338, "rougeL_precision": 0.08876222177060916, "rougeL_precision_stderr": 0.0013761155158951413, "rougeL_recall": 0.21448631404181784, "rougeL_recall_stderr": 0.003198193852945419, "rougeLsum_fmeasure": 0.12593169516392616, "rougeLsum_fmeasure_stderr": 0.001962773566689985, "rougeLsum_precision": 0.09102722612426341, "rougeLsum_precision_stderr": 0.0014951062687195294, "rougeLsum_recall": 0.22021791608209987, "rougeLsum_recall_stderr": 0.003544530658562865}}, "4": {"article_DOC_summary": {"bleu": 0.6055404023350197, "bleu_stderr": 0.09409517819503949, "rouge1_fmeasure": 0.04426046707797172, "rouge1_fmeasure_stderr": 0.0024997989992097624, "rouge1_precision": 0.0383955897314317, "rouge1_precision_stderr": 0.0025422863710305266, "rouge1_recall": 0.06915759782304308, "rouge1_recall_stderr": 0.0039608171509673586, "rouge2_fmeasure": 0.008141029291990268, "rouge2_fmeasure_stderr": 0.0008202019731008721, "rouge2_precision": 0.006481358010154669, "rouge2_precision_stderr": 0.0007393187434393857, "rouge2_recall": 0.013044537342091751, "rouge2_recall_stderr": 0.00125152402986618, "rougeL_fmeasure": 0.035265847972026074, "rougeL_fmeasure_stderr": 0.0020006364168593496, "rougeL_precision": 0.031113719615522017, "rougeL_precision_stderr": 0.0021898843752975964, "rougeL_recall": 0.055197273257753664, "rougeL_recall_stderr": 0.003191098630834803, "rougeLsum_fmeasure": 0.03632930315349291, "rougeLsum_fmeasure_stderr": 0.0020756841173550696, "rougeLsum_precision": 0.03210804984465074, "rougeLsum_precision_stderr": 0.0022623416009452243, "rougeLsum_recall": 0.05680048575290953, "rougeLsum_recall_stderr": 0.0032931638988068055}}, "5": {"article_DOC_summary": {"bleu": 1.1094899638874875e-37, "bleu_stderr": 1.0662006738968273e-31, "rouge1_fmeasure": 0.0024831420596845987, "rouge1_fmeasure_stderr": 0.0007070331562075694, "rouge1_precision": 0.002821793157093116, "rouge1_precision_stderr": 0.0008106295472902334, "rouge1_recall": 0.0022851464113943067, "rouge1_recall_stderr": 0.0006484962567584845, "rouge2_fmeasure": 0.0002519246692387647, "rouge2_fmeasure_stderr": 0.00017508831809454146, "rouge2_precision": 0.0002914830883754302, "rouge2_precision_stderr": 0.00020340968454234058, "rouge2_recall": 0.00022401756585996315, "rouge2_recall_stderr": 0.0001545711517822221, "rougeL_fmeasure": 0.001910475794160408, "rougeL_fmeasure_stderr": 0.0005775932958294451, "rougeL_precision": 0.0021547453202339587, "rougeL_precision_stderr": 0.0006563220446223475, "rougeL_recall": 0.0017708458456009723, "rougeL_recall_stderr": 0.0005325445996971974, "rougeLsum_fmeasure": 0.0020447516574242644, "rougeLsum_fmeasure_stderr": 0.000612874204929875, "rougeLsum_precision": 0.002307213397230338, "rougeLsum_precision_stderr": 0.0006934611042748366, "rougeLsum_recall": 0.001892317665185999, "rougeLsum_recall_stderr": 0.0005664327179419651}}}}
2b855b9bc4seed2/evaluation/generation/merged.csv CHANGED
@@ -27,9 +27,27 @@ gem_xsum,5,median,rouge2_fmeasure,0.00017753481649226955
27
  gem_xsum,5,average,multiple,0.025395720198443345
28
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.048745998009714464
29
  web_nlg_en,0,median,rouge2_fmeasure,0.048745998009714464
30
- web_nlg_en,0,average,multiple,0.048745998009714464
 
 
 
 
 
 
 
 
 
 
31
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03551854752059051
32
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03551854752059051
33
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04208923831822898
34
  wiki_lingua_en,1,median,rouge2_fmeasure,0.04208923831822898
35
- wiki_lingua_en,1,average,multiple,0.03880389291940975
 
 
 
 
 
 
 
 
 
27
  gem_xsum,5,average,multiple,0.025395720198443345
28
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.048745998009714464
29
  web_nlg_en,0,median,rouge2_fmeasure,0.048745998009714464
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.04974187660065682
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.04974187660065682
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.052439071054187616
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.052439071054187616
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05264782543517679
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05264782543517679
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0542723753051547
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.0542723753051547
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05405387695384774
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05405387695384774
40
+ web_nlg_en,5,average,multiple,0.05198350389312302
41
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03551854752059051
42
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03551854752059051
43
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04208923831822898
44
  wiki_lingua_en,1,median,rouge2_fmeasure,0.04208923831822898
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04596426564369226
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.04596426564369226
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.038198357822552355
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.038198357822552355
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012389475323072278
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.012389475323072278
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0019437622260819584
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0019437622260819584
53
+ wiki_lingua_en,5,average,multiple,0.02935060780903639
2b855b9bc4seed2/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
1
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3750956835220534, "bleu_stderr": 0.03303866348324263, "rouge1_fmeasure": 0.1034915916030452, "rouge1_fmeasure_stderr": 0.0020463645545355925, "rouge1_precision": 0.06729841211664779, "rouge1_precision_stderr": 0.0014779125598332707, "rouge1_recall": 0.29304332001997996, "rouge1_recall_stderr": 0.004946222762455597, "rouge2_fmeasure": 0.048745998009714464, "rouge2_fmeasure_stderr": 0.0012510511387985711, "rouge2_precision": 0.031583798963941895, "rouge2_precision_stderr": 0.0008797642983980407, "rouge2_recall": 0.14233929411373747, "rouge2_recall_stderr": 0.003345640165065958, "rougeL_fmeasure": 0.10020083541023005, "rougeL_fmeasure_stderr": 0.0019268832996130537, "rougeL_precision": 0.06496957050414408, "rougeL_precision_stderr": 0.0013738739239028245, "rougeL_recall": 0.2861388731751563, "rougeL_recall_stderr": 0.004839232451958966, "rougeLsum_fmeasure": 0.09872549675778793, "rougeLsum_fmeasure_stderr": 0.0019292683793847847, "rougeLsum_precision": 0.06418206519400182, "rougeLsum_precision_stderr": 0.001390783570882245, "rougeLsum_recall": 0.2794417864990063, "rougeLsum_recall_stderr": 0.004636157223173928}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5392368463230133, "bleu_stderr": 0.053948912749046035, "rouge1_fmeasure": 0.17949455554206634, "rouge1_fmeasure_stderr": 0.001796797335172746, "rouge1_precision": 0.15312528607963066, "rouge1_precision_stderr": 0.0018475398053558751, "rouge1_recall": 0.26069676849871193, "rouge1_recall_stderr": 0.002553411390285405, "rouge2_fmeasure": 0.03551854752059051, "rouge2_fmeasure_stderr": 0.0008299460804507645, "rouge2_precision": 0.03025768195820637, "rouge2_precision_stderr": 0.000737285788609388, "rouge2_recall": 0.0526377509392788, "rouge2_recall_stderr": 0.0013707148216665472, "rougeL_fmeasure": 0.14125944815133307, "rougeL_fmeasure_stderr": 0.001293606648023059, "rougeL_precision": 0.11900796171626439, "rougeL_precision_stderr": 0.0012961585979789718, "rougeL_recall": 0.21022152388268012, "rougeL_recall_stderr": 0.0020859201845126016, "rougeLsum_fmeasure": 0.16495917517256573, "rougeLsum_fmeasure_stderr": 0.001644741818116919, "rougeLsum_precision": 0.14065190421878745, "rougeLsum_precision_stderr": 0.0016987560973759559, "rougeLsum_recall": 0.24028450248920846, "rougeLsum_recall_stderr": 0.0023647123157988985}}, "1": {"tldr_en": {"bleu": 2.0614970786142583, "bleu_stderr": 0.09284907950143785, "rouge1_fmeasure": 0.19472791234999667, "rouge1_fmeasure_stderr": 0.001906191600167485, "rouge1_precision": 0.1668110350174811, "rouge1_precision_stderr": 0.0019877097402536763, "rouge1_recall": 0.28257427077519554, "rouge1_recall_stderr": 0.002760368419127019, "rouge2_fmeasure": 0.04208923831822898, "rouge2_fmeasure_stderr": 0.0009197645366087788, "rouge2_precision": 0.035928254592281224, "rouge2_precision_stderr": 0.0008382143224304085, "rouge2_recall": 0.06299590355552512, "rouge2_recall_stderr": 0.0015297554624304605, "rougeL_fmeasure": 0.14254660444247075, "rougeL_fmeasure_stderr": 0.0012952641600081254, "rougeL_precision": 0.12071476283575872, "rougeL_precision_stderr": 0.0013170745626523695, "rougeL_recall": 0.21195380139438252, "rougeL_recall_stderr": 0.002156914307318323, "rougeLsum_fmeasure": 0.18187224601097693, "rougeLsum_fmeasure_stderr": 0.0017770105186546616, "rougeLsum_precision": 0.15552475435698954, "rougeLsum_precision_stderr": 0.0018425071564360774, "rougeLsum_recall": 0.2646754231072801, "rougeLsum_recall_stderr": 0.0026025376230503126}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.027496856259569415, "bleu_stderr": 0.007385963274542756, "rouge1_fmeasure": 0.02786087179209838, "rouge1_fmeasure_stderr": 0.0005858506970554961, "rouge1_precision": 0.047807397913058214, "rouge1_precision_stderr": 0.0011364617294874694, "rouge1_recall": 0.022673512097835077, "rouge1_recall_stderr": 0.0005137246819540603, "rouge2_fmeasure": 0.0002573048402040541, "rouge2_fmeasure_stderr": 6.575032923202268e-05, "rouge2_precision": 0.0005045565387850128, "rouge2_precision_stderr": 0.00014549502538483447, "rouge2_recall": 0.00021403935275164058, "rouge2_recall_stderr": 5.468859488917244e-05, "rougeL_fmeasure": 0.02773258214801112, "rougeL_fmeasure_stderr": 0.0005799438210362486, "rougeL_precision": 0.04750467841033873, "rougeL_precision_stderr": 0.0011193107075921842, "rougeL_recall": 0.022556340907519876, "rougeL_recall_stderr": 0.000501594662926671, "rougeLsum_fmeasure": 0.027138243313024377, "rougeLsum_fmeasure_stderr": 0.0005613131933490365, "rougeLsum_precision": 0.04681714649780676, "rougeLsum_precision_stderr": 0.0011159994745820552, "rougeLsum_recall": 0.02201246000410449, "rougeLsum_recall_stderr": 0.00047810257232395407}}, "1": {"generate_text_restaurant": {"bleu": 5.314362512162757, "bleu_stderr": 0.0769982113030766, "rouge1_fmeasure": 0.3138272169710467, "rouge1_fmeasure_stderr": 0.0018338780001365314, "rouge1_precision": 0.25918480539023137, "rouge1_precision_stderr": 0.002016038150145768, "rouge1_recall": 0.45092555851546073, "rouge1_recall_stderr": 0.002775382584769768, "rouge2_fmeasure": 0.12479903412317217, "rouge2_fmeasure_stderr": 0.0013250592629703154, "rouge2_precision": 0.10262303268628689, "rouge2_precision_stderr": 0.0012494801287872546, "rouge2_recall": 0.18308739944422936, "rouge2_recall_stderr": 0.0020326480850756325, "rougeL_fmeasure": 0.2503482721276052, "rougeL_fmeasure_stderr": 0.0013578494077290393, "rougeL_precision": 0.20524097551911571, "rougeL_precision_stderr": 0.0014796154894487817, "rougeL_recall": 0.3644112328636815, "rougeL_recall_stderr": 0.0023556498368056318, "rougeLsum_fmeasure": 0.2572370513641328, "rougeLsum_fmeasure_stderr": 0.0017528475888098256, "rougeLsum_precision": 0.21324966168952111, "rougeLsum_precision_stderr": 0.0018757528611749424, "rougeLsum_recall": 0.3686148255109357, "rougeLsum_recall_stderr": 0.0025955691777193217}}, "2": {"generate_text_restaurant": {"bleu": 6.325830512149402, "bleu_stderr": 0.09465026494214827, "rouge1_fmeasure": 0.35038288574502346, "rouge1_fmeasure_stderr": 0.0017512378126847482, "rouge1_precision": 0.29476393089554137, "rouge1_precision_stderr": 0.0018788751048144279, "rouge1_recall": 0.47796061164305575, "rouge1_recall_stderr": 0.002752937100665692, "rouge2_fmeasure": 0.14648456630581705, "rouge2_fmeasure_stderr": 0.001382565495597909, "rouge2_precision": 0.1225316105353968, "rouge2_precision_stderr": 0.0012759941001301229, "rouge2_recall": 0.20377380172237158, "rouge2_recall_stderr": 0.0021030901186470078, "rougeL_fmeasure": 0.25470473636536506, "rougeL_fmeasure_stderr": 0.0013682076501230478, "rougeL_precision": 0.2139893115508532, "rougeL_precision_stderr": 0.0014438796540664535, "rougeL_recall": 0.3497428021452183, "rougeL_recall_stderr": 0.002279119692449497, "rougeLsum_fmeasure": 0.2954049051400817, "rougeLsum_fmeasure_stderr": 0.0016940279750374688, "rougeLsum_precision": 0.24914819370477054, "rougeLsum_precision_stderr": 0.001781429279282853, "rougeLsum_recall": 0.40204602010365786, "rougeLsum_recall_stderr": 0.002550588410355925}}, "3": {"generate_text_restaurant": {"bleu": 6.220755100721931, "bleu_stderr": 0.08670753957301724, "rouge1_fmeasure": 0.35031384994100806, "rouge1_fmeasure_stderr": 0.0016629497950809106, "rouge1_precision": 0.28326448174434227, "rouge1_precision_stderr": 0.0015904147315234246, "rouge1_recall": 0.4912377845146275, "rouge1_recall_stderr": 0.0026742890310022037, "rouge2_fmeasure": 0.14718125822293368, "rouge2_fmeasure_stderr": 0.0013638027262871243, "rouge2_precision": 0.11808337262877049, "rouge2_precision_stderr": 0.0011828407846607325, "rouge2_recall": 0.211495428182817, "rouge2_recall_stderr": 0.0021550210917977604, "rougeL_fmeasure": 0.24600845724972414, "rougeL_fmeasure_stderr": 0.0013072908419869695, "rougeL_precision": 0.19847946216112136, "rougeL_precision_stderr": 0.001208194793948507, "rougeL_recall": 0.3473816584720071, "rougeL_recall_stderr": 0.0022369392784332546, "rougeLsum_fmeasure": 0.2961838000715837, "rougeLsum_fmeasure_stderr": 0.0016190692110987968, "rougeLsum_precision": 0.23966070087085453, "rougeLsum_precision_stderr": 0.0015055005166558567, "rougeLsum_recall": 0.4149981951143496, "rougeLsum_recall_stderr": 0.0025291151450575807}}, "4": {"generate_text_restaurant": {"bleu": 6.389458547015763, "bleu_stderr": 0.07567250209055609, "rouge1_fmeasure": 0.3515156760106122, "rouge1_fmeasure_stderr": 0.0016532451325087115, "rouge1_precision": 0.28593161432431424, "rouge1_precision_stderr": 0.0015919612522573242, "rouge1_recall": 0.4874525990823487, "rouge1_recall_stderr": 0.002594781648430376, "rouge2_fmeasure": 0.14861120179755016, "rouge2_fmeasure_stderr": 0.0013885860121316607, "rouge2_precision": 0.11974496863366363, "rouge2_precision_stderr": 0.0011676406530046445, "rouge2_recall": 0.21088419111689036, "rouge2_recall_stderr": 0.002150241945207948, "rougeL_fmeasure": 0.24771148695595446, "rougeL_fmeasure_stderr": 0.0013410031747652237, "rougeL_precision": 0.20089041282335124, "rougeL_precision_stderr": 0.001212814806634473, "rougeL_recall": 0.34593464970921395, "rougeL_recall_stderr": 0.002240164116339641, "rougeLsum_fmeasure": 0.2976927202262742, "rougeLsum_fmeasure_stderr": 0.0016328209282317906, "rougeLsum_precision": 0.24209629823192363, "rougeLsum_precision_stderr": 0.0015046779900312415, "rougeLsum_recall": 0.41292727791904127, "rougeLsum_recall_stderr": 0.002515568479286581}}, "5": {"generate_text_restaurant": {"bleu": 6.5778109853818805, "bleu_stderr": 0.07853639654126673, "rouge1_fmeasure": 0.3522694213042175, "rouge1_fmeasure_stderr": 0.0016367915675506936, "rouge1_precision": 0.2903438083004083, "rouge1_precision_stderr": 0.0016258609800168166, "rouge1_recall": 0.4798034781155066, "rouge1_recall_stderr": 0.002558773195235382, "rouge2_fmeasure": 0.15043417065549097, "rouge2_fmeasure_stderr": 0.0013696094059916363, "rouge2_precision": 0.12304693214365679, "rouge2_precision_stderr": 0.0011814359326877534, "rouge2_recall": 0.20889730658325914, "rouge2_recall_stderr": 0.002078829798291329, "rougeL_fmeasure": 0.2515843186162369, "rougeL_fmeasure_stderr": 0.001348337199469822, "rougeL_precision": 0.20668270832400937, "rougeL_precision_stderr": 0.0012420561784777045, "rougeL_recall": 0.3449859636097895, "rougeL_recall_stderr": 0.002236880429413282, "rougeLsum_fmeasure": 0.2995796499530265, "rougeLsum_fmeasure_stderr": 0.0016286619231701276, "rougeLsum_precision": 0.24693348710209848, "rougeLsum_precision_stderr": 0.0015466223462410505, "rougeLsum_recall": 0.4079639743417248, "rougeLsum_recall_stderr": 0.0024745366963463723}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.672427500415346, "bleu_stderr": 0.06224047870202198, "rouge1_fmeasure": 0.20461196621127212, "rouge1_fmeasure_stderr": 0.002456416561114147, "rouge1_precision": 0.15446349983996752, "rouge1_precision_stderr": 0.002056543674678123, "rouge1_recall": 0.33511082297031264, "rouge1_recall_stderr": 0.0042735988519539215, "rouge2_fmeasure": 0.04183742358897129, "rouge2_fmeasure_stderr": 0.0014163052777246905, "rouge2_precision": 0.030568062524863993, "rouge2_precision_stderr": 0.00106051192463649, "rouge2_recall": 0.07227511022307367, "rouge2_recall_stderr": 0.0025089135207627426, "rougeL_fmeasure": 0.15065663850491834, "rougeL_fmeasure_stderr": 0.0017993673274426995, "rougeL_precision": 0.11362014484428674, "rougeL_precision_stderr": 0.001502727857377175, "rougeL_recall": 0.24803625345806893, "rougeL_recall_stderr": 0.003251203569494646, "rougeLsum_fmeasure": 0.1597417527650389, "rougeLsum_fmeasure_stderr": 0.002049977023045275, "rougeLsum_precision": 0.12014955371512309, "rougeLsum_precision_stderr": 0.0016511000633980721, "rougeLsum_recall": 0.26350730851862786, "rougeLsum_recall_stderr": 0.0037010867344718494}}, "1": {"article_DOC_summary": {"bleu": 1.275721416879001, "bleu_stderr": 0.06517346432102149, "rouge1_fmeasure": 0.17002150605190583, "rouge1_fmeasure_stderr": 0.002413215946011016, "rouge1_precision": 0.12085626806737927, "rouge1_precision_stderr": 0.0017891193548989438, "rouge1_recall": 0.2987160321435191, "rouge1_recall_stderr": 0.00413368541077085, "rouge2_fmeasure": 0.032840477830095596, "rouge2_fmeasure_stderr": 0.0013243307135319853, "rouge2_precision": 0.023063271052401108, "rouge2_precision_stderr": 0.0009280012800721673, "rouge2_recall": 0.05940351775728508, "rouge2_recall_stderr": 0.0024837332976990266, "rougeL_fmeasure": 0.13101534493327607, "rougeL_fmeasure_stderr": 0.0017667403668890078, "rougeL_precision": 0.09287906478368882, "rougeL_precision_stderr": 0.0012943640278186897, "rougeL_recall": 0.23214124490959148, "rougeL_recall_stderr": 0.00319891767484422, "rougeLsum_fmeasure": 0.13530824264400498, "rougeLsum_fmeasure_stderr": 0.0019563437490256476, "rougeLsum_precision": 0.09594077364128303, "rougeLsum_precision_stderr": 0.001428271472982218, "rougeLsum_recall": 0.2393756506317347, "rougeLsum_recall_stderr": 0.003497762504180999}}, "2": {"article_DOC_summary": {"bleu": 1.415586441135803, "bleu_stderr": 0.08272312685335785, "rouge1_fmeasure": 0.17593641988573824, "rouge1_fmeasure_stderr": 0.00234463310312545, "rouge1_precision": 0.12470672450008642, "rouge1_precision_stderr": 0.001730877605157259, "rouge1_recall": 0.31075289796152855, "rouge1_recall_stderr": 0.004103173454358926, "rouge2_fmeasure": 0.03581736634962746, "rouge2_fmeasure_stderr": 0.0013556261837556886, "rouge2_precision": 0.025120528884661486, "rouge2_precision_stderr": 0.0009506484753053252, "rouge2_recall": 0.06507926184404549, "rouge2_recall_stderr": 0.0025723662373737556, "rougeL_fmeasure": 0.13731969194870292, "rougeL_fmeasure_stderr": 0.0017728970309222165, "rougeL_precision": 0.097210056441488, "rougeL_precision_stderr": 0.00130179992393822, "rougeL_recall": 0.24368451935612578, "rougeL_recall_stderr": 0.0032187047798505693, "rougeLsum_fmeasure": 0.14160043868370525, "rougeLsum_fmeasure_stderr": 0.001972243082446741, "rougeLsum_precision": 0.10015575582491773, "rougeLsum_precision_stderr": 0.0014343870962585403, "rougeLsum_recall": 0.25163047768424157, "rougeLsum_recall_stderr": 0.0035877808840131804}}, "3": {"article_DOC_summary": {"bleu": 1.3855685926696633, "bleu_stderr": 0.09518631321191244, "rouge1_fmeasure": 0.17024859520096114, "rouge1_fmeasure_stderr": 0.002562946794229022, "rouge1_precision": 0.12396012732837984, "rouge1_precision_stderr": 0.0020737206796275333, "rouge1_recall": 0.294082787536851, "rouge1_recall_stderr": 0.004394659694676491, "rouge2_fmeasure": 0.03333710891779061, "rouge2_fmeasure_stderr": 0.0013557965156104658, "rouge2_precision": 0.02383270351288902, "rouge2_precision_stderr": 0.0009962269501358127, "rouge2_recall": 0.0597937637900738, "rouge2_recall_stderr": 0.0025128359808654207, "rougeL_fmeasure": 0.13130207255582982, "rougeL_fmeasure_stderr": 0.0018962132114596106, "rougeL_precision": 0.0951854301888063, "rougeL_precision_stderr": 0.001504553701052666, "rougeL_recall": 0.2288027311191167, "rougeL_recall_stderr": 0.0034070040841674277, "rougeLsum_fmeasure": 0.13553160155677008, "rougeLsum_fmeasure_stderr": 0.002110243338807585, "rougeLsum_precision": 0.09840275750442219, "rougeLsum_precision_stderr": 0.0017177817559235623, "rougeLsum_recall": 0.23599364564166173, "rougeLsum_recall_stderr": 0.00374171509560645}}, "4": {"article_DOC_summary": {"bleu": 0.6328883165870776, "bleu_stderr": 0.13295917920386843, "rouge1_fmeasure": 0.04511319204333184, "rouge1_fmeasure_stderr": 0.00253231999079197, "rouge1_precision": 0.0385226096298928, "rouge1_precision_stderr": 0.002445729090310371, "rouge1_recall": 0.07058918571425392, "rouge1_recall_stderr": 0.004055809965880756, "rouge2_fmeasure": 0.008364409687682841, "rouge2_fmeasure_stderr": 0.000821942672548505, "rouge2_precision": 0.006582553276982632, "rouge2_precision_stderr": 0.0006957960840224698, "rouge2_recall": 0.013801684775810714, "rouge2_recall_stderr": 0.0013847950972590455, "rougeL_fmeasure": 0.035199023564244016, "rougeL_fmeasure_stderr": 0.0019664417280659367, "rougeL_precision": 0.030493320960123818, "rougeL_precision_stderr": 0.0020172488216199987, "rougeL_recall": 0.05525735538867771, "rougeL_recall_stderr": 0.0031830815732572424, "rougeLsum_fmeasure": 0.037049573469593644, "rougeLsum_fmeasure_stderr": 0.0020924709342865803, "rougeLsum_precision": 0.032113373783877935, "rougeLsum_precision_stderr": 0.0021307041062483545, "rougeLsum_recall": 0.058070454031511114, "rougeLsum_recall_stderr": 0.0033830316361808826}}, "5": {"article_DOC_summary": {"bleu": 4.665459965858088e-38, "bleu_stderr": 5.5026833855786054e-33, "rouge1_fmeasure": 0.0018318259362959952, "rouge1_fmeasure_stderr": 0.0004972909761861116, "rouge1_precision": 0.002002107077973075, "rouge1_precision_stderr": 0.0005578257118175824, "rouge1_recall": 0.001762094811101669, "rouge1_recall_stderr": 0.00047783830113039527, "rouge2_fmeasure": 0.00017753481649226955, "rouge2_fmeasure_stderr": 8.909774211763047e-05, "rouge2_precision": 0.00020887285731902822, "rouge2_precision_stderr": 0.00010454399623682942, "rouge2_recall": 0.00016020143378633943, "rouge2_recall_stderr": 8.24302746259628e-05, "rougeL_fmeasure": 0.0015100528537815832, "rougeL_fmeasure_stderr": 0.00041465009328220766, "rougeL_precision": 0.0016875974156627233, "rougeL_precision_stderr": 0.00048448535968082044, "rougeL_recall": 0.001424633735571781, "rougeL_recall_stderr": 0.00038296569125613873, "rougeLsum_fmeasure": 0.0016080680461363983, "rougeLsum_fmeasure_stderr": 0.0004477924018194676, "rougeLsum_precision": 0.0017778745665158427, "rougeLsum_precision_stderr": 0.0005088500755445573, "rougeLsum_recall": 0.00153183785220986, "rougeLsum_recall_stderr": 0.0004253215571278959}}}}
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3750956835220534, "bleu_stderr": 0.03303866348324263, "rouge1_fmeasure": 0.1034915916030452, "rouge1_fmeasure_stderr": 0.0020463645545355925, "rouge1_precision": 0.06729841211664779, "rouge1_precision_stderr": 0.0014779125598332707, "rouge1_recall": 0.29304332001997996, "rouge1_recall_stderr": 0.004946222762455597, "rouge2_fmeasure": 0.048745998009714464, "rouge2_fmeasure_stderr": 0.0012510511387985711, "rouge2_precision": 0.031583798963941895, "rouge2_precision_stderr": 0.0008797642983980407, "rouge2_recall": 0.14233929411373747, "rouge2_recall_stderr": 0.003345640165065958, "rougeL_fmeasure": 0.10020083541023005, "rougeL_fmeasure_stderr": 0.0019268832996130537, "rougeL_precision": 0.06496957050414408, "rougeL_precision_stderr": 0.0013738739239028245, "rougeL_recall": 0.2861388731751563, "rougeL_recall_stderr": 0.004839232451958966, "rougeLsum_fmeasure": 0.09872549675778793, "rougeLsum_fmeasure_stderr": 0.0019292683793847847, "rougeLsum_precision": 0.06418206519400182, "rougeLsum_precision_stderr": 0.001390783570882245, "rougeLsum_recall": 0.2794417864990063, "rougeLsum_recall_stderr": 0.004636157223173928}}, "1": {"PALM_prompt": {"bleu": 0.4365766652740948, "bleu_stderr": 0.037844648511825396, "rouge1_fmeasure": 0.10877696688177962, "rouge1_fmeasure_stderr": 0.001945767451470737, "rouge1_precision": 0.06994832542929158, "rouge1_precision_stderr": 0.0014495808290369214, "rouge1_recall": 0.3490531761559243, "rouge1_recall_stderr": 0.005184113888991831, "rouge2_fmeasure": 0.04974187660065682, "rouge2_fmeasure_stderr": 0.001196316556026764, "rouge2_precision": 0.03193707122441378, "rouge2_precision_stderr": 0.0008629068963180332, "rouge2_recall": 0.1657017691626549, "rouge2_recall_stderr": 0.003586081586904503, "rougeL_fmeasure": 0.10225212989808725, "rougeL_fmeasure_stderr": 0.0017763245424793478, "rougeL_precision": 0.06565378035925949, "rougeL_precision_stderr": 0.001308891042633236, "rougeL_recall": 0.32681385252271544, "rougeL_recall_stderr": 0.004755175816761329, "rougeLsum_fmeasure": 0.1033250851795133, "rougeLsum_fmeasure_stderr": 0.0018350799221626799, "rougeLsum_precision": 0.06646392126839666, "rougeLsum_precision_stderr": 0.0013644803957867627, "rougeLsum_recall": 0.3298156773546831, "rougeLsum_recall_stderr": 0.004757624723143256}}, "2": {"PALM_prompt": {"bleu": 0.4720415619133146, "bleu_stderr": 0.03028898123227167, "rouge1_fmeasure": 0.11394894646533223, "rouge1_fmeasure_stderr": 0.001878183428143987, "rouge1_precision": 0.07270379572501368, "rouge1_precision_stderr": 0.0013940955739727785, "rouge1_recall": 0.3721320713880717, "rouge1_recall_stderr": 0.005025146233893169, "rouge2_fmeasure": 0.052439071054187616, "rouge2_fmeasure_stderr": 0.0011887777636420736, "rouge2_precision": 0.03335870092535265, "rouge2_precision_stderr": 0.0008494358081912536, "rouge2_recall": 0.17967810296863904, "rouge2_recall_stderr": 0.003647905777910192, "rougeL_fmeasure": 0.10629114450877862, "rougeL_fmeasure_stderr": 0.0017199169469615488, "rougeL_precision": 0.06778190190034863, "rougeL_precision_stderr": 0.0012652848544027523, "rougeL_recall": 0.34517810629847506, "rougeL_recall_stderr": 0.004569703942035579, "rougeLsum_fmeasure": 0.10821914727514863, "rougeLsum_fmeasure_stderr": 0.0017733019493284057, "rougeLsum_precision": 0.06905024518953087, "rougeLsum_precision_stderr": 0.0013136086935812618, "rougeLsum_recall": 0.35243452469564673, "rougeLsum_recall_stderr": 0.004681765880078895}}, "3": {"PALM_prompt": {"bleu": 0.505309723079554, "bleu_stderr": 0.030789048631243345, "rouge1_fmeasure": 0.1140811559986869, "rouge1_fmeasure_stderr": 0.0018314449750337714, "rouge1_precision": 0.07266150602286287, "rouge1_precision_stderr": 0.0013570702728215546, "rouge1_recall": 0.375335802973, "rouge1_recall_stderr": 0.0049513897844448915, "rouge2_fmeasure": 0.05264782543517679, "rouge2_fmeasure_stderr": 0.0011603135950920642, "rouge2_precision": 0.03339540011443451, "rouge2_precision_stderr": 0.0008203290563190548, "rouge2_recall": 0.1823532126204787, "rouge2_recall_stderr": 0.0036717068306557055, "rougeL_fmeasure": 0.10595583325313408, "rougeL_fmeasure_stderr": 0.0016668840428279569, "rougeL_precision": 0.0674357161794326, "rougeL_precision_stderr": 0.0012225512883614093, "rougeL_recall": 0.34653200198788353, "rougeL_recall_stderr": 0.004452191597706626, "rougeLsum_fmeasure": 0.1081446644399457, "rougeLsum_fmeasure_stderr": 0.0017244386092601673, "rougeLsum_precision": 0.0688830759913898, "rougeLsum_precision_stderr": 0.001272997217309663, "rougeLsum_recall": 0.3545702139819994, "rougeLsum_recall_stderr": 0.00457733786223568}}, "4": {"PALM_prompt": {"bleu": 0.5118075228451896, "bleu_stderr": 0.03421703924709246, "rouge1_fmeasure": 0.11791689956198792, "rouge1_fmeasure_stderr": 0.001817909406189755, "rouge1_precision": 0.074981616410397, "rouge1_precision_stderr": 0.0013590623406350294, "rouge1_recall": 0.3896720052301274, "rouge1_recall_stderr": 0.0049507467556851375, "rouge2_fmeasure": 0.0542723753051547, "rouge2_fmeasure_stderr": 0.0011401850272983066, "rouge2_precision": 0.03432282283699023, "rouge2_precision_stderr": 0.0008130842468814066, "rouge2_recall": 0.1906287391116607, "rouge2_recall_stderr": 0.003599080016877428, "rougeL_fmeasure": 0.10914538376163477, "rougeL_fmeasure_stderr": 0.0016168378348594852, "rougeL_precision": 0.06934159095655563, "rougeL_precision_stderr": 0.001199401487410265, "rougeL_recall": 0.359472896781564, "rougeL_recall_stderr": 0.004372935200678683, "rougeLsum_fmeasure": 0.1120022009379364, "rougeLsum_fmeasure_stderr": 0.0017057257373490598, "rougeLsum_precision": 0.07120084070095285, "rougeLsum_precision_stderr": 0.001270934154620542, "rougeLsum_recall": 0.3700069019770497, "rougeLsum_recall_stderr": 0.004598828278904394}}, "5": {"PALM_prompt": {"bleu": 0.5489315892257162, "bleu_stderr": 0.03566319150553619, "rouge1_fmeasure": 0.11748298831973306, "rouge1_fmeasure_stderr": 0.001804400803275616, "rouge1_precision": 0.0745923126280549, "rouge1_precision_stderr": 0.0013376269029467707, "rouge1_recall": 0.3905930314677698, "rouge1_recall_stderr": 0.004950215357717335, "rouge2_fmeasure": 0.05405387695384774, "rouge2_fmeasure_stderr": 0.0011402270130816087, "rouge2_precision": 0.03413131993305688, "rouge2_precision_stderr": 0.0008059901084375219, "rouge2_recall": 0.19113943375004094, "rouge2_recall_stderr": 0.0035679901920801166, "rougeL_fmeasure": 0.108203267837238, "rougeL_fmeasure_stderr": 0.0016224994947192633, "rougeL_precision": 0.06865758811298832, "rougeL_precision_stderr": 0.0011980020885612244, "rougeL_recall": 0.35942245571318887, "rougeL_recall_stderr": 0.00443107907757396, "rougeLsum_fmeasure": 0.11077970473967182, "rougeLsum_fmeasure_stderr": 0.0016901774942724233, "rougeLsum_precision": 0.07030942281210417, "rougeLsum_precision_stderr": 0.0012515624821265068, "rougeLsum_recall": 0.3692281620151466, "rougeLsum_recall_stderr": 0.004608117532229734}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5392368463230133, "bleu_stderr": 0.053948912749046035, "rouge1_fmeasure": 0.17949455554206634, "rouge1_fmeasure_stderr": 0.001796797335172746, "rouge1_precision": 0.15312528607963066, "rouge1_precision_stderr": 0.0018475398053558751, "rouge1_recall": 0.26069676849871193, "rouge1_recall_stderr": 0.002553411390285405, "rouge2_fmeasure": 0.03551854752059051, "rouge2_fmeasure_stderr": 0.0008299460804507645, "rouge2_precision": 0.03025768195820637, "rouge2_precision_stderr": 0.000737285788609388, "rouge2_recall": 0.0526377509392788, "rouge2_recall_stderr": 0.0013707148216665472, "rougeL_fmeasure": 0.14125944815133307, "rougeL_fmeasure_stderr": 0.001293606648023059, "rougeL_precision": 0.11900796171626439, "rougeL_precision_stderr": 0.0012961585979789718, "rougeL_recall": 0.21022152388268012, "rougeL_recall_stderr": 0.0020859201845126016, "rougeLsum_fmeasure": 0.16495917517256573, "rougeLsum_fmeasure_stderr": 0.001644741818116919, "rougeLsum_precision": 0.14065190421878745, "rougeLsum_precision_stderr": 0.0016987560973759559, "rougeLsum_recall": 0.24028450248920846, "rougeLsum_recall_stderr": 0.0023647123157988985}}, "1": {"tldr_en": {"bleu": 2.0614970786142583, "bleu_stderr": 0.09284907950143785, "rouge1_fmeasure": 0.19472791234999667, "rouge1_fmeasure_stderr": 0.001906191600167485, "rouge1_precision": 0.1668110350174811, "rouge1_precision_stderr": 0.0019877097402536763, "rouge1_recall": 0.28257427077519554, "rouge1_recall_stderr": 0.002760368419127019, "rouge2_fmeasure": 0.04208923831822898, "rouge2_fmeasure_stderr": 0.0009197645366087788, "rouge2_precision": 0.035928254592281224, "rouge2_precision_stderr": 0.0008382143224304085, "rouge2_recall": 0.06299590355552512, "rouge2_recall_stderr": 0.0015297554624304605, "rougeL_fmeasure": 0.14254660444247075, "rougeL_fmeasure_stderr": 0.0012952641600081254, "rougeL_precision": 0.12071476283575872, "rougeL_precision_stderr": 0.0013170745626523695, "rougeL_recall": 0.21195380139438252, "rougeL_recall_stderr": 0.002156914307318323, "rougeLsum_fmeasure": 0.18187224601097693, "rougeLsum_fmeasure_stderr": 0.0017770105186546616, "rougeLsum_precision": 0.15552475435698954, "rougeLsum_precision_stderr": 0.0018425071564360774, "rougeLsum_recall": 0.2646754231072801, "rougeLsum_recall_stderr": 0.0026025376230503126}}, "2": {"tldr_en": {"bleu": 2.292030395922892, "bleu_stderr": 0.06754694010620639, "rouge1_fmeasure": 0.20233033966420053, "rouge1_fmeasure_stderr": 0.001922238330347806, "rouge1_precision": 0.17503399476665835, "rouge1_precision_stderr": 0.0020658059154131355, "rouge1_recall": 0.29065133902349155, "rouge1_recall_stderr": 0.002706254805478106, "rouge2_fmeasure": 0.04596426564369226, "rouge2_fmeasure_stderr": 0.0009410781862317926, "rouge2_precision": 0.03957061154540559, "rouge2_precision_stderr": 0.0008663796206601736, "rouge2_recall": 0.06804676099365331, "rouge2_recall_stderr": 0.001558997607590439, "rougeL_fmeasure": 0.1484100309670736, "rougeL_fmeasure_stderr": 0.0013050413657904906, "rougeL_precision": 0.12692489719648392, "rougeL_precision_stderr": 0.0013802977601841946, "rougeL_recall": 0.2190474701025381, "rougeL_recall_stderr": 0.0021621371892660115, "rougeLsum_fmeasure": 0.1892405934401096, "rougeLsum_fmeasure_stderr": 0.0017922082822405196, "rougeLsum_precision": 0.1634222056607189, "rougeLsum_precision_stderr": 0.001920848983739493, "rougeLsum_recall": 0.27275533749258335, "rougeLsum_recall_stderr": 0.002561203631756384}}, "3": {"tldr_en": {"bleu": 2.233722192322054, "bleu_stderr": 0.05524157791896203, "rouge1_fmeasure": 0.1709872129969158, "rouge1_fmeasure_stderr": 0.002182801673316525, "rouge1_precision": 0.1528781021722799, "rouge1_precision_stderr": 0.0023528740048175136, "rouge1_recall": 0.2465107599485029, "rouge1_recall_stderr": 0.0032260989920230546, "rouge2_fmeasure": 0.038198357822552355, "rouge2_fmeasure_stderr": 0.0008958010274232348, "rouge2_precision": 0.033714456652552384, "rouge2_precision_stderr": 0.0008897021090186058, "rouge2_recall": 0.05775087421244321, "rouge2_recall_stderr": 0.0015593470864269968, "rougeL_fmeasure": 0.1256014046824487, "rougeL_fmeasure_stderr": 0.0015230285778693542, "rougeL_precision": 0.11164062299344275, "rougeL_precision_stderr": 0.0016829741874525947, "rougeL_recall": 0.18567352684120597, "rougeL_recall_stderr": 0.0025231022862244067, "rougeLsum_fmeasure": 0.160128027571052, "rougeLsum_fmeasure_stderr": 0.0020352820126400516, "rougeLsum_precision": 0.1430704524634149, "rougeLsum_precision_stderr": 0.0021969936921205774, "rougeLsum_recall": 0.2315193169307614, "rougeLsum_recall_stderr": 0.0030387376754444774}}, "4": {"tldr_en": {"bleu": 0.5079955725024277, "bleu_stderr": 0.035957194737455005, "rouge1_fmeasure": 0.05602015160131341, "rouge1_fmeasure_stderr": 0.0018664416661875064, "rouge1_precision": 0.052051218039705936, "rouge1_precision_stderr": 0.001963781996856516, "rouge1_recall": 0.08403225179407155, "rouge1_recall_stderr": 0.0028517043268622634, "rouge2_fmeasure": 0.012389475323072278, "rouge2_fmeasure_stderr": 0.0006098445157690174, "rouge2_precision": 0.011455358594622569, "rouge2_precision_stderr": 0.00073252531556729, "rouge2_recall": 0.020113630470345888, "rouge2_recall_stderr": 0.001131208867765504, "rougeL_fmeasure": 0.04231335237789037, "rougeL_fmeasure_stderr": 0.0013811919732020263, "rougeL_precision": 0.03935546213521773, "rougeL_precision_stderr": 0.001510763386700604, "rougeL_recall": 0.06538057294951942, "rougeL_recall_stderr": 0.002279377122252467, "rougeLsum_fmeasure": 0.05234967974089488, "rougeLsum_fmeasure_stderr": 0.0017422831492367827, "rougeLsum_precision": 0.048625295083917906, "rougeLsum_precision_stderr": 0.0018399091735182703, "rougeLsum_recall": 0.07874986853171788, "rougeLsum_recall_stderr": 0.0026850001210979844}}, "5": {"tldr_en": {"bleu": 9.095635249089997e-07, "bleu_stderr": 1.740964036496507e-06, "rouge1_fmeasure": 0.008988455531554258, "rouge1_fmeasure_stderr": 0.0008426740304726489, "rouge1_precision": 0.00833679063458924, "rouge1_precision_stderr": 0.0008338992451058071, "rouge1_recall": 0.014012820827305905, "rouge1_recall_stderr": 0.0013421148602955919, "rouge2_fmeasure": 0.0019437622260819584, "rouge2_fmeasure_stderr": 0.0002467959167648044, "rouge2_precision": 0.0016920491601071506, "rouge2_precision_stderr": 0.00022329552812013756, "rouge2_recall": 0.003270419901715466, "rouge2_recall_stderr": 0.0004597968144887323, "rougeL_fmeasure": 0.006734235793805549, "rougeL_fmeasure_stderr": 0.0006224347025618367, "rougeL_precision": 0.006219772423705417, "rougeL_precision_stderr": 0.0006090869904396185, "rougeL_recall": 0.010787022000549866, "rougeL_recall_stderr": 0.0010632210108394717, "rougeLsum_fmeasure": 0.00849274908984561, "rougeLsum_fmeasure_stderr": 0.0007902239379156704, "rougeLsum_precision": 0.007890145322072297, "rougeLsum_precision_stderr": 0.0007857621643905961, "rougeLsum_recall": 0.013284636696551826, "rougeLsum_recall_stderr": 0.0012729971468667274}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.027496856259569415, "bleu_stderr": 0.007385963274542756, "rouge1_fmeasure": 0.02786087179209838, "rouge1_fmeasure_stderr": 0.0005858506970554961, "rouge1_precision": 0.047807397913058214, "rouge1_precision_stderr": 0.0011364617294874694, "rouge1_recall": 0.022673512097835077, "rouge1_recall_stderr": 0.0005137246819540603, "rouge2_fmeasure": 0.0002573048402040541, "rouge2_fmeasure_stderr": 6.575032923202268e-05, "rouge2_precision": 0.0005045565387850128, "rouge2_precision_stderr": 0.00014549502538483447, "rouge2_recall": 0.00021403935275164058, "rouge2_recall_stderr": 5.468859488917244e-05, "rougeL_fmeasure": 0.02773258214801112, "rougeL_fmeasure_stderr": 0.0005799438210362486, "rougeL_precision": 0.04750467841033873, "rougeL_precision_stderr": 0.0011193107075921842, "rougeL_recall": 0.022556340907519876, "rougeL_recall_stderr": 0.000501594662926671, "rougeLsum_fmeasure": 0.027138243313024377, "rougeLsum_fmeasure_stderr": 0.0005613131933490365, "rougeLsum_precision": 0.04681714649780676, "rougeLsum_precision_stderr": 0.0011159994745820552, "rougeLsum_recall": 0.02201246000410449, "rougeLsum_recall_stderr": 0.00047810257232395407}}, "1": {"generate_text_restaurant": {"bleu": 5.314362512162757, "bleu_stderr": 0.0769982113030766, "rouge1_fmeasure": 0.3138272169710467, "rouge1_fmeasure_stderr": 0.0018338780001365314, "rouge1_precision": 0.25918480539023137, "rouge1_precision_stderr": 0.002016038150145768, "rouge1_recall": 0.45092555851546073, "rouge1_recall_stderr": 0.002775382584769768, "rouge2_fmeasure": 0.12479903412317217, "rouge2_fmeasure_stderr": 0.0013250592629703154, "rouge2_precision": 0.10262303268628689, "rouge2_precision_stderr": 0.0012494801287872546, "rouge2_recall": 0.18308739944422936, "rouge2_recall_stderr": 0.0020326480850756325, "rougeL_fmeasure": 0.2503482721276052, "rougeL_fmeasure_stderr": 0.0013578494077290393, "rougeL_precision": 0.20524097551911571, "rougeL_precision_stderr": 0.0014796154894487817, "rougeL_recall": 0.3644112328636815, "rougeL_recall_stderr": 0.0023556498368056318, "rougeLsum_fmeasure": 0.2572370513641328, "rougeLsum_fmeasure_stderr": 0.0017528475888098256, "rougeLsum_precision": 0.21324966168952111, "rougeLsum_precision_stderr": 0.0018757528611749424, "rougeLsum_recall": 0.3686148255109357, "rougeLsum_recall_stderr": 0.0025955691777193217}}, "2": {"generate_text_restaurant": {"bleu": 6.325830512149402, "bleu_stderr": 0.09465026494214827, "rouge1_fmeasure": 0.35038288574502346, "rouge1_fmeasure_stderr": 0.0017512378126847482, "rouge1_precision": 0.29476393089554137, "rouge1_precision_stderr": 0.0018788751048144279, "rouge1_recall": 0.47796061164305575, "rouge1_recall_stderr": 0.002752937100665692, "rouge2_fmeasure": 0.14648456630581705, "rouge2_fmeasure_stderr": 0.001382565495597909, "rouge2_precision": 0.1225316105353968, "rouge2_precision_stderr": 0.0012759941001301229, "rouge2_recall": 0.20377380172237158, "rouge2_recall_stderr": 0.0021030901186470078, "rougeL_fmeasure": 0.25470473636536506, "rougeL_fmeasure_stderr": 0.0013682076501230478, "rougeL_precision": 0.2139893115508532, "rougeL_precision_stderr": 0.0014438796540664535, "rougeL_recall": 0.3497428021452183, "rougeL_recall_stderr": 0.002279119692449497, "rougeLsum_fmeasure": 0.2954049051400817, "rougeLsum_fmeasure_stderr": 0.0016940279750374688, "rougeLsum_precision": 0.24914819370477054, "rougeLsum_precision_stderr": 0.001781429279282853, "rougeLsum_recall": 0.40204602010365786, "rougeLsum_recall_stderr": 0.002550588410355925}}, "3": {"generate_text_restaurant": {"bleu": 6.220755100721931, "bleu_stderr": 0.08670753957301724, "rouge1_fmeasure": 0.35031384994100806, "rouge1_fmeasure_stderr": 0.0016629497950809106, "rouge1_precision": 0.28326448174434227, "rouge1_precision_stderr": 0.0015904147315234246, "rouge1_recall": 0.4912377845146275, "rouge1_recall_stderr": 0.0026742890310022037, "rouge2_fmeasure": 0.14718125822293368, "rouge2_fmeasure_stderr": 0.0013638027262871243, "rouge2_precision": 0.11808337262877049, "rouge2_precision_stderr": 0.0011828407846607325, "rouge2_recall": 0.211495428182817, "rouge2_recall_stderr": 0.0021550210917977604, "rougeL_fmeasure": 0.24600845724972414, "rougeL_fmeasure_stderr": 0.0013072908419869695, "rougeL_precision": 0.19847946216112136, "rougeL_precision_stderr": 0.001208194793948507, "rougeL_recall": 0.3473816584720071, "rougeL_recall_stderr": 0.0022369392784332546, "rougeLsum_fmeasure": 0.2961838000715837, "rougeLsum_fmeasure_stderr": 0.0016190692110987968, "rougeLsum_precision": 0.23966070087085453, "rougeLsum_precision_stderr": 0.0015055005166558567, "rougeLsum_recall": 0.4149981951143496, "rougeLsum_recall_stderr": 0.0025291151450575807}}, "4": {"generate_text_restaurant": {"bleu": 6.389458547015763, "bleu_stderr": 0.07567250209055609, "rouge1_fmeasure": 0.3515156760106122, "rouge1_fmeasure_stderr": 0.0016532451325087115, "rouge1_precision": 0.28593161432431424, "rouge1_precision_stderr": 0.0015919612522573242, "rouge1_recall": 0.4874525990823487, "rouge1_recall_stderr": 0.002594781648430376, "rouge2_fmeasure": 0.14861120179755016, "rouge2_fmeasure_stderr": 0.0013885860121316607, "rouge2_precision": 0.11974496863366363, "rouge2_precision_stderr": 0.0011676406530046445, "rouge2_recall": 0.21088419111689036, "rouge2_recall_stderr": 0.002150241945207948, "rougeL_fmeasure": 0.24771148695595446, "rougeL_fmeasure_stderr": 0.0013410031747652237, "rougeL_precision": 0.20089041282335124, "rougeL_precision_stderr": 0.001212814806634473, "rougeL_recall": 0.34593464970921395, "rougeL_recall_stderr": 0.002240164116339641, "rougeLsum_fmeasure": 0.2976927202262742, "rougeLsum_fmeasure_stderr": 0.0016328209282317906, "rougeLsum_precision": 0.24209629823192363, "rougeLsum_precision_stderr": 0.0015046779900312415, "rougeLsum_recall": 0.41292727791904127, "rougeLsum_recall_stderr": 0.002515568479286581}}, "5": {"generate_text_restaurant": {"bleu": 6.5778109853818805, "bleu_stderr": 0.07853639654126673, "rouge1_fmeasure": 0.3522694213042175, "rouge1_fmeasure_stderr": 0.0016367915675506936, "rouge1_precision": 0.2903438083004083, "rouge1_precision_stderr": 0.0016258609800168166, "rouge1_recall": 0.4798034781155066, "rouge1_recall_stderr": 0.002558773195235382, "rouge2_fmeasure": 0.15043417065549097, "rouge2_fmeasure_stderr": 0.0013696094059916363, "rouge2_precision": 0.12304693214365679, "rouge2_precision_stderr": 0.0011814359326877534, "rouge2_recall": 0.20889730658325914, "rouge2_recall_stderr": 0.002078829798291329, "rougeL_fmeasure": 0.2515843186162369, "rougeL_fmeasure_stderr": 0.001348337199469822, "rougeL_precision": 0.20668270832400937, "rougeL_precision_stderr": 0.0012420561784777045, "rougeL_recall": 0.3449859636097895, "rougeL_recall_stderr": 0.002236880429413282, "rougeLsum_fmeasure": 0.2995796499530265, "rougeLsum_fmeasure_stderr": 0.0016286619231701276, "rougeLsum_precision": 0.24693348710209848, "rougeLsum_precision_stderr": 0.0015466223462410505, "rougeLsum_recall": 0.4079639743417248, "rougeLsum_recall_stderr": 0.0024745366963463723}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.672427500415346, "bleu_stderr": 0.06224047870202198, "rouge1_fmeasure": 0.20461196621127212, "rouge1_fmeasure_stderr": 0.002456416561114147, "rouge1_precision": 0.15446349983996752, "rouge1_precision_stderr": 0.002056543674678123, "rouge1_recall": 0.33511082297031264, "rouge1_recall_stderr": 0.0042735988519539215, "rouge2_fmeasure": 0.04183742358897129, "rouge2_fmeasure_stderr": 0.0014163052777246905, "rouge2_precision": 0.030568062524863993, "rouge2_precision_stderr": 0.00106051192463649, "rouge2_recall": 0.07227511022307367, "rouge2_recall_stderr": 0.0025089135207627426, "rougeL_fmeasure": 0.15065663850491834, "rougeL_fmeasure_stderr": 0.0017993673274426995, "rougeL_precision": 0.11362014484428674, "rougeL_precision_stderr": 0.001502727857377175, "rougeL_recall": 0.24803625345806893, "rougeL_recall_stderr": 0.003251203569494646, "rougeLsum_fmeasure": 0.1597417527650389, "rougeLsum_fmeasure_stderr": 0.002049977023045275, "rougeLsum_precision": 0.12014955371512309, "rougeLsum_precision_stderr": 0.0016511000633980721, "rougeLsum_recall": 0.26350730851862786, "rougeLsum_recall_stderr": 0.0037010867344718494}}, "1": {"article_DOC_summary": {"bleu": 1.275721416879001, "bleu_stderr": 0.06517346432102149, "rouge1_fmeasure": 0.17002150605190583, "rouge1_fmeasure_stderr": 0.002413215946011016, "rouge1_precision": 0.12085626806737927, "rouge1_precision_stderr": 0.0017891193548989438, "rouge1_recall": 0.2987160321435191, "rouge1_recall_stderr": 0.00413368541077085, "rouge2_fmeasure": 0.032840477830095596, "rouge2_fmeasure_stderr": 0.0013243307135319853, "rouge2_precision": 0.023063271052401108, "rouge2_precision_stderr": 0.0009280012800721673, "rouge2_recall": 0.05940351775728508, "rouge2_recall_stderr": 0.0024837332976990266, "rougeL_fmeasure": 0.13101534493327607, "rougeL_fmeasure_stderr": 0.0017667403668890078, "rougeL_precision": 0.09287906478368882, "rougeL_precision_stderr": 0.0012943640278186897, "rougeL_recall": 0.23214124490959148, "rougeL_recall_stderr": 0.00319891767484422, "rougeLsum_fmeasure": 0.13530824264400498, "rougeLsum_fmeasure_stderr": 0.0019563437490256476, "rougeLsum_precision": 0.09594077364128303, "rougeLsum_precision_stderr": 0.001428271472982218, "rougeLsum_recall": 0.2393756506317347, "rougeLsum_recall_stderr": 0.003497762504180999}}, "2": {"article_DOC_summary": {"bleu": 1.415586441135803, "bleu_stderr": 0.08272312685335785, "rouge1_fmeasure": 0.17593641988573824, "rouge1_fmeasure_stderr": 0.00234463310312545, "rouge1_precision": 0.12470672450008642, "rouge1_precision_stderr": 0.001730877605157259, "rouge1_recall": 0.31075289796152855, "rouge1_recall_stderr": 0.004103173454358926, "rouge2_fmeasure": 0.03581736634962746, "rouge2_fmeasure_stderr": 0.0013556261837556886, "rouge2_precision": 0.025120528884661486, "rouge2_precision_stderr": 0.0009506484753053252, "rouge2_recall": 0.06507926184404549, "rouge2_recall_stderr": 0.0025723662373737556, "rougeL_fmeasure": 0.13731969194870292, "rougeL_fmeasure_stderr": 0.0017728970309222165, "rougeL_precision": 0.097210056441488, "rougeL_precision_stderr": 0.00130179992393822, "rougeL_recall": 0.24368451935612578, "rougeL_recall_stderr": 0.0032187047798505693, "rougeLsum_fmeasure": 0.14160043868370525, "rougeLsum_fmeasure_stderr": 0.001972243082446741, "rougeLsum_precision": 0.10015575582491773, "rougeLsum_precision_stderr": 0.0014343870962585403, "rougeLsum_recall": 0.25163047768424157, "rougeLsum_recall_stderr": 0.0035877808840131804}}, "3": {"article_DOC_summary": {"bleu": 1.3855685926696633, "bleu_stderr": 0.09518631321191244, "rouge1_fmeasure": 0.17024859520096114, "rouge1_fmeasure_stderr": 0.002562946794229022, "rouge1_precision": 0.12396012732837984, "rouge1_precision_stderr": 0.0020737206796275333, "rouge1_recall": 0.294082787536851, "rouge1_recall_stderr": 0.004394659694676491, "rouge2_fmeasure": 0.03333710891779061, "rouge2_fmeasure_stderr": 0.0013557965156104658, "rouge2_precision": 0.02383270351288902, "rouge2_precision_stderr": 0.0009962269501358127, "rouge2_recall": 0.0597937637900738, "rouge2_recall_stderr": 0.0025128359808654207, "rougeL_fmeasure": 0.13130207255582982, "rougeL_fmeasure_stderr": 0.0018962132114596106, "rougeL_precision": 0.0951854301888063, "rougeL_precision_stderr": 0.001504553701052666, "rougeL_recall": 0.2288027311191167, "rougeL_recall_stderr": 0.0034070040841674277, "rougeLsum_fmeasure": 0.13553160155677008, "rougeLsum_fmeasure_stderr": 0.002110243338807585, "rougeLsum_precision": 0.09840275750442219, "rougeLsum_precision_stderr": 0.0017177817559235623, "rougeLsum_recall": 0.23599364564166173, "rougeLsum_recall_stderr": 0.00374171509560645}}, "4": {"article_DOC_summary": {"bleu": 0.6328883165870776, "bleu_stderr": 0.13295917920386843, "rouge1_fmeasure": 0.04511319204333184, "rouge1_fmeasure_stderr": 0.00253231999079197, "rouge1_precision": 0.0385226096298928, "rouge1_precision_stderr": 0.002445729090310371, "rouge1_recall": 0.07058918571425392, "rouge1_recall_stderr": 0.004055809965880756, "rouge2_fmeasure": 0.008364409687682841, "rouge2_fmeasure_stderr": 0.000821942672548505, "rouge2_precision": 0.006582553276982632, "rouge2_precision_stderr": 0.0006957960840224698, "rouge2_recall": 0.013801684775810714, "rouge2_recall_stderr": 0.0013847950972590455, "rougeL_fmeasure": 0.035199023564244016, "rougeL_fmeasure_stderr": 0.0019664417280659367, "rougeL_precision": 0.030493320960123818, "rougeL_precision_stderr": 0.0020172488216199987, "rougeL_recall": 0.05525735538867771, "rougeL_recall_stderr": 0.0031830815732572424, "rougeLsum_fmeasure": 0.037049573469593644, "rougeLsum_fmeasure_stderr": 0.0020924709342865803, "rougeLsum_precision": 0.032113373783877935, "rougeLsum_precision_stderr": 0.0021307041062483545, "rougeLsum_recall": 0.058070454031511114, "rougeLsum_recall_stderr": 0.0033830316361808826}}, "5": {"article_DOC_summary": {"bleu": 4.665459965858088e-38, "bleu_stderr": 5.5026833855786054e-33, "rouge1_fmeasure": 0.0018318259362959952, "rouge1_fmeasure_stderr": 0.0004972909761861116, "rouge1_precision": 0.002002107077973075, "rouge1_precision_stderr": 0.0005578257118175824, "rouge1_recall": 0.001762094811101669, "rouge1_recall_stderr": 0.00047783830113039527, "rouge2_fmeasure": 0.00017753481649226955, "rouge2_fmeasure_stderr": 8.909774211763047e-05, "rouge2_precision": 0.00020887285731902822, "rouge2_precision_stderr": 0.00010454399623682942, "rouge2_recall": 0.00016020143378633943, "rouge2_recall_stderr": 8.24302746259628e-05, "rougeL_fmeasure": 0.0015100528537815832, "rougeL_fmeasure_stderr": 0.00041465009328220766, "rougeL_precision": 0.0016875974156627233, "rougeL_precision_stderr": 0.00048448535968082044, "rougeL_recall": 0.001424633735571781, "rougeL_recall_stderr": 0.00038296569125613873, "rougeLsum_fmeasure": 0.0016080680461363983, "rougeLsum_fmeasure_stderr": 0.0004477924018194676, "rougeLsum_precision": 0.0017778745665158427, "rougeLsum_precision_stderr": 0.0005088500755445573, "rougeLsum_recall": 0.00153183785220986, "rougeLsum_recall_stderr": 0.0004253215571278959}}}}
2b855b9bc4seed3/evaluation/generation/merged.csv CHANGED
@@ -27,9 +27,27 @@ gem_xsum,5,median,rouge2_fmeasure,0.0002533740120631878
27
  gem_xsum,5,average,multiple,0.023467846396737577
28
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0493613190828951
29
  web_nlg_en,0,median,rouge2_fmeasure,0.0493613190828951
30
- web_nlg_en,0,average,multiple,0.0493613190828951
 
 
 
 
 
 
 
 
 
 
31
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03364466039752911
32
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03364466039752911
33
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04146537149742157
34
  wiki_lingua_en,1,median,rouge2_fmeasure,0.04146537149742157
35
- wiki_lingua_en,1,average,multiple,0.03755501594747534
 
 
 
 
 
 
 
 
 
27
  gem_xsum,5,average,multiple,0.023467846396737577
28
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0493613190828951
29
  web_nlg_en,0,median,rouge2_fmeasure,0.0493613190828951
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05246738310334436
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.05246738310334436
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05603019582407278
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.05603019582407278
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05577366815646396
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05577366815646396
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05615553678250917
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05615553678250917
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.056241483469904825
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.056241483469904825
40
+ web_nlg_en,5,average,multiple,0.05433826440319837
41
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03364466039752911
42
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03364466039752911
43
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04146537149742157
44
  wiki_lingua_en,1,median,rouge2_fmeasure,0.04146537149742157
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04476923922576134
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.04476923922576134
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03767208670961266
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.03767208670961266
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012154113417992855
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.012154113417992855
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020072557460750727
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0020072557460750727
53
+ wiki_lingua_en,5,average,multiple,0.028618787832398768
2b855b9bc4seed3/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
1
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3640781107267224, "bleu_stderr": 0.035526130232376905, "rouge1_fmeasure": 0.10446452400904378, "rouge1_fmeasure_stderr": 0.0021282046267782563, "rouge1_precision": 0.07014102967481332, "rouge1_precision_stderr": 0.0017603361253258872, "rouge1_recall": 0.28842029546681935, "rouge1_recall_stderr": 0.0049233552951936715, "rouge2_fmeasure": 0.0493613190828951, "rouge2_fmeasure_stderr": 0.0012678462201386074, "rouge2_precision": 0.0330408702642744, "rouge2_precision_stderr": 0.0010389544177208019, "rouge2_recall": 0.14098272377215726, "rouge2_recall_stderr": 0.003297340089435132, "rougeL_fmeasure": 0.10036321845998371, "rougeL_fmeasure_stderr": 0.0019766166905939206, "rougeL_precision": 0.06704364750637273, "rougeL_precision_stderr": 0.0016022359995174224, "rougeL_recall": 0.2800801497048708, "rougeL_recall_stderr": 0.004801675913591644, "rougeLsum_fmeasure": 0.09950407096550104, "rougeLsum_fmeasure_stderr": 0.001998952718407562, "rougeLsum_precision": 0.06676887269215705, "rougeLsum_precision_stderr": 0.0016525447643874539, "rougeLsum_recall": 0.2753731807713974, "rougeLsum_recall_stderr": 0.004658503872997521}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4785870749297974, "bleu_stderr": 0.058287705839301554, "rouge1_fmeasure": 0.17553793244289345, "rouge1_fmeasure_stderr": 0.001806401016559939, "rouge1_precision": 0.15044351613730478, "rouge1_precision_stderr": 0.001873937661358226, "rouge1_recall": 0.2541675583808366, "rouge1_recall_stderr": 0.002525767681285887, "rouge2_fmeasure": 0.03364466039752911, "rouge2_fmeasure_stderr": 0.0008133761132820285, "rouge2_precision": 0.028754049926889878, "rouge2_precision_stderr": 0.0007284335979063377, "rouge2_recall": 0.0498554560379571, "rouge2_recall_stderr": 0.0012994427687278028, "rougeL_fmeasure": 0.1387301222348297, "rougeL_fmeasure_stderr": 0.0012940665219750957, "rougeL_precision": 0.11754314152860335, "rougeL_precision_stderr": 0.0013223617006764188, "rougeL_recall": 0.20544370534958584, "rougeL_recall_stderr": 0.0020272468710765853, "rougeLsum_fmeasure": 0.16126585981961294, "rougeLsum_fmeasure_stderr": 0.0016467066936214445, "rougeLsum_precision": 0.13796688806438603, "rougeLsum_precision_stderr": 0.0017060133557980854, "rougeLsum_recall": 0.23453206833843981, "rougeLsum_recall_stderr": 0.002342224647799962}}, "1": {"tldr_en": {"bleu": 2.0467243985974637, "bleu_stderr": 0.07716877063896437, "rouge1_fmeasure": 0.1921501387436706, "rouge1_fmeasure_stderr": 0.00188282133972654, "rouge1_precision": 0.16583754031020287, "rouge1_precision_stderr": 0.0020211759897587646, "rouge1_recall": 0.2784359478170013, "rouge1_recall_stderr": 0.002692298233698752, "rouge2_fmeasure": 0.04146537149742157, "rouge2_fmeasure_stderr": 0.0008844838235072672, "rouge2_precision": 0.035939445160926, "rouge2_precision_stderr": 0.0008394970960275082, "rouge2_recall": 0.06220805715536351, "rouge2_recall_stderr": 0.0015155473184693795, "rougeL_fmeasure": 0.14062569600905112, "rougeL_fmeasure_stderr": 0.0012672119441252353, "rougeL_precision": 0.12014085192612638, "rougeL_precision_stderr": 0.0013614505956239505, "rougeL_recall": 0.2086744134191942, "rougeL_recall_stderr": 0.0020828906859784203, "rougeLsum_fmeasure": 0.179434656949378, "rougeLsum_fmeasure_stderr": 0.0017505927676760471, "rougeLsum_precision": 0.15464361675465313, "rougeLsum_precision_stderr": 0.001877438093158986, "rougeLsum_recall": 0.26102453302903145, "rougeLsum_recall_stderr": 0.0025492852995640664}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.0956694980647792, "bleu_stderr": 0.04999557005701438, "rouge1_fmeasure": 0.09170375521862002, "rouge1_fmeasure_stderr": 0.0016027220200337815, "rouge1_precision": 0.09503245055218068, "rouge1_precision_stderr": 0.002227002440189141, "rouge1_recall": 0.12519204548417082, "rouge1_recall_stderr": 0.002257706262720428, "rouge2_fmeasure": 0.0200914197255096, "rouge2_fmeasure_stderr": 0.0007228266926632685, "rouge2_precision": 0.015747413529649493, "rouge2_precision_stderr": 0.0005857234957460363, "rouge2_recall": 0.03085419325647102, "rouge2_recall_stderr": 0.0011016936972703011, "rougeL_fmeasure": 0.08870793127413534, "rougeL_fmeasure_stderr": 0.0015148695280122193, "rougeL_precision": 0.09097585318147142, "rougeL_precision_stderr": 0.0020889683329724184, "rougeL_recall": 0.12191519471001734, "rougeL_recall_stderr": 0.002171780673661775, "rougeLsum_fmeasure": 0.0784374812184608, "rougeLsum_fmeasure_stderr": 0.0014050281578343764, "rougeLsum_precision": 0.0834425343644634, "rougeLsum_precision_stderr": 0.0020931903981870545, "rougeLsum_recall": 0.10608182715305749, "rougeLsum_recall_stderr": 0.0019274028911518155}}, "1": {"generate_text_restaurant": {"bleu": 10.151239164071983, "bleu_stderr": 0.1257334156643682, "rouge1_fmeasure": 0.40675497527393745, "rouge1_fmeasure_stderr": 0.002120685128829797, "rouge1_precision": 0.4698413043094089, "rouge1_precision_stderr": 0.0030273225025413382, "rouge1_recall": 0.40584950166038797, "rouge1_recall_stderr": 0.0029229476256733026, "rouge2_fmeasure": 0.17669097143978887, "rouge2_fmeasure_stderr": 0.001716223261618654, "rouge2_precision": 0.2074578784589061, "rouge2_precision_stderr": 0.0023047939137167727, "rouge2_recall": 0.17636894739489137, "rouge2_recall_stderr": 0.001969048658306312, "rougeL_fmeasure": 0.2915979204699439, "rougeL_fmeasure_stderr": 0.0017827978723647846, "rougeL_precision": 0.3404061024475899, "rougeL_precision_stderr": 0.0026532569303317583, "rougeL_recall": 0.2896879499504477, "rougeL_recall_stderr": 0.0022806434671987395, "rougeLsum_fmeasure": 0.33318221999557907, "rougeLsum_fmeasure_stderr": 0.0020291648036479103, "rougeLsum_precision": 0.3852954389535586, "rougeLsum_precision_stderr": 0.0028137809730615915, "rougeLsum_recall": 0.3327977299709147, "rougeLsum_recall_stderr": 0.002657870623602599}}, "2": {"generate_text_restaurant": {"bleu": 10.477786265685717, "bleu_stderr": 0.18647473053208208, "rouge1_fmeasure": 0.4181182679230384, "rouge1_fmeasure_stderr": 0.0020452985763851753, "rouge1_precision": 0.4750249856912901, "rouge1_precision_stderr": 0.003117749277972381, "rouge1_recall": 0.420167230887751, "rouge1_recall_stderr": 0.00277185209174179, "rouge2_fmeasure": 0.19044056117286426, "rouge2_fmeasure_stderr": 0.00168575182239935, "rouge2_precision": 0.22050994133149254, "rouge2_precision_stderr": 0.0023245851449860567, "rouge2_recall": 0.19130121956469073, "rouge2_recall_stderr": 0.001942626358068861, "rougeL_fmeasure": 0.2982970663330926, "rougeL_fmeasure_stderr": 0.0017726787771058774, "rougeL_precision": 0.3406569558070198, "rougeL_precision_stderr": 0.0026479989912893075, "rougeL_recall": 0.299702139963536, "rougeL_recall_stderr": 0.0022640373889766096, "rougeLsum_fmeasure": 0.34419023746163807, "rougeLsum_fmeasure_stderr": 0.0019847110018846874, "rougeLsum_precision": 0.3901601617973346, "rougeLsum_precision_stderr": 0.002815160547643611, "rougeLsum_recall": 0.3471499862247708, "rougeLsum_recall_stderr": 0.0026065622780937396}}, "3": {"generate_text_restaurant": {"bleu": 10.568317116179442, "bleu_stderr": 0.11556927070417884, "rouge1_fmeasure": 0.41842021335444884, "rouge1_fmeasure_stderr": 0.0020188593584297234, "rouge1_precision": 0.47159606393712117, "rouge1_precision_stderr": 0.003129868683115526, "rouge1_recall": 0.4234823828365637, "rouge1_recall_stderr": 0.0027467329857418952, "rouge2_fmeasure": 0.1938496862679729, "rouge2_fmeasure_stderr": 0.0016754810953723432, "rouge2_precision": 0.22231457931781198, "rouge2_precision_stderr": 0.0023199388284090745, "rouge2_recall": 0.19702786066961364, "rouge2_recall_stderr": 0.001990057035725131, "rougeL_fmeasure": 0.29696016514568546, "rougeL_fmeasure_stderr": 0.0017603697168916232, "rougeL_precision": 0.33656147022200217, "rougeL_precision_stderr": 0.002667737808938232, "rougeL_recall": 0.30066480270962387, "rougeL_recall_stderr": 0.0022622576572450003, "rougeLsum_fmeasure": 0.3438826400107135, "rougeLsum_fmeasure_stderr": 0.0019856692684705, "rougeLsum_precision": 0.38705644076443046, "rougeLsum_precision_stderr": 0.0028690431456947075, "rougeLsum_recall": 0.349206081387889, "rougeLsum_recall_stderr": 0.002597394859544418}}, "4": {"generate_text_restaurant": {"bleu": 10.581390958595872, "bleu_stderr": 0.16287225143334053, "rouge1_fmeasure": 0.4202226303244981, "rouge1_fmeasure_stderr": 0.0019805005077845462, "rouge1_precision": 0.46703203534176785, "rouge1_precision_stderr": 0.003027608948766856, "rouge1_recall": 0.42673885443033144, "rouge1_recall_stderr": 0.002653056579198862, "rouge2_fmeasure": 0.19550059496111732, "rouge2_fmeasure_stderr": 0.001681137850682549, "rouge2_precision": 0.221024427493211, "rouge2_precision_stderr": 0.0022668760796123264, "rouge2_recall": 0.19893689501234513, "rouge2_recall_stderr": 0.0019645186911920118, "rougeL_fmeasure": 0.2960631555841986, "rougeL_fmeasure_stderr": 0.00175639502051002, "rougeL_precision": 0.3305021376136181, "rougeL_precision_stderr": 0.0025711911253961032, "rougeL_recall": 0.30080598135342596, "rougeL_recall_stderr": 0.002216619700518367, "rougeLsum_fmeasure": 0.34711141302754794, "rougeLsum_fmeasure_stderr": 0.0019655722013055365, "rougeLsum_precision": 0.3848753584944118, "rougeLsum_precision_stderr": 0.0027594334460907116, "rougeLsum_recall": 0.35379992432844626, "rougeLsum_recall_stderr": 0.002554582589973131}}, "5": {"generate_text_restaurant": {"bleu": 10.161698634389158, "bleu_stderr": 0.13251687133818035, "rouge1_fmeasure": 0.41690846563222644, "rouge1_fmeasure_stderr": 0.0019329282633560236, "rouge1_precision": 0.45350802506846094, "rouge1_precision_stderr": 0.0029332747892263492, "rouge1_recall": 0.4298932002561076, "rouge1_recall_stderr": 0.0025654012473086065, "rouge2_fmeasure": 0.19180440349615147, "rouge2_fmeasure_stderr": 0.0016038971120681744, "rouge2_precision": 0.21155169806401755, "rouge2_precision_stderr": 0.0021213554539015983, "rouge2_recall": 0.19815792477697378, "rouge2_recall_stderr": 0.0018697079815536174, "rougeL_fmeasure": 0.2943936910199933, "rougeL_fmeasure_stderr": 0.001703424504126945, "rougeL_precision": 0.3208681338440674, "rougeL_precision_stderr": 0.0024465770749067034, "rougeL_recall": 0.3044498221521836, "rougeL_recall_stderr": 0.0021841272279665724, "rougeLsum_fmeasure": 0.3474979202384499, "rougeLsum_fmeasure_stderr": 0.0019204154404832128, "rougeLsum_precision": 0.37693689810608394, "rougeLsum_precision_stderr": 0.0026776647538697042, "rougeLsum_recall": 0.3598773179164774, "rougeLsum_recall_stderr": 0.002490089475383021}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8522377895450308, "bleu_stderr": 0.07723955076500588, "rouge1_fmeasure": 0.20202631280258004, "rouge1_fmeasure_stderr": 0.0024569661256867497, "rouge1_precision": 0.1530345030359518, "rouge1_precision_stderr": 0.002115037282090291, "rouge1_recall": 0.3297522507077294, "rouge1_recall_stderr": 0.004233264868359823, "rouge2_fmeasure": 0.0438628207106013, "rouge2_fmeasure_stderr": 0.0015033748211488044, "rouge2_precision": 0.03260120789118233, "rouge2_precision_stderr": 0.0011626221940092174, "rouge2_recall": 0.07472054203573325, "rouge2_recall_stderr": 0.002669966683230418, "rougeL_fmeasure": 0.15285396489063718, "rougeL_fmeasure_stderr": 0.0018457457908837322, "rougeL_precision": 0.1154609949889405, "rougeL_precision_stderr": 0.001570276581336417, "rougeL_recall": 0.2514593427654629, "rougeL_recall_stderr": 0.003335220546206934, "rougeLsum_fmeasure": 0.15642992761683933, "rougeLsum_fmeasure_stderr": 0.0020624169809871596, "rougeLsum_precision": 0.11796967069833372, "rougeLsum_precision_stderr": 0.0016923604745038886, "rougeLsum_recall": 0.2574959975250046, "rougeLsum_recall_stderr": 0.0037084065360991183}}, "1": {"article_DOC_summary": {"bleu": 1.2604705578203963, "bleu_stderr": 0.06124796466601407, "rouge1_fmeasure": 0.16964759605780336, "rouge1_fmeasure_stderr": 0.002326270519236544, "rouge1_precision": 0.12043203461580464, "rouge1_precision_stderr": 0.0017443586213575642, "rouge1_recall": 0.2990330815832083, "rouge1_recall_stderr": 0.00392004786403155, "rouge2_fmeasure": 0.03145088602415999, "rouge2_fmeasure_stderr": 0.001268842216195083, "rouge2_precision": 0.022185940713628696, "rouge2_precision_stderr": 0.0009035076589227027, "rouge2_recall": 0.05647340976722095, "rouge2_recall_stderr": 0.0022888090744372525, "rougeL_fmeasure": 0.13279585103698274, "rougeL_fmeasure_stderr": 0.0017386360127803384, "rougeL_precision": 0.0940358491286338, "rougeL_precision_stderr": 0.0012896725062631583, "rougeL_recall": 0.23588081649114553, "rougeL_recall_stderr": 0.003076131089195475, "rougeLsum_fmeasure": 0.13714896670693033, "rougeLsum_fmeasure_stderr": 0.001891141826576318, "rougeLsum_precision": 0.09715832932274611, "rougeLsum_precision_stderr": 0.0014027160695998882, "rougeLsum_recall": 0.24318297005771927, "rougeLsum_recall_stderr": 0.0032866076983934975}}, "2": {"article_DOC_summary": {"bleu": 1.1993548757142343, "bleu_stderr": 0.07506740455601302, "rouge1_fmeasure": 0.16618737976479214, "rouge1_fmeasure_stderr": 0.002276816920530847, "rouge1_precision": 0.11769325953014931, "rouge1_precision_stderr": 0.0017015278474607515, "rouge1_recall": 0.29438013695551873, "rouge1_recall_stderr": 0.0038789493089544562, "rouge2_fmeasure": 0.029880831851623216, "rouge2_fmeasure_stderr": 0.00122944703122044, "rouge2_precision": 0.02096426606557719, "rouge2_precision_stderr": 0.0008665435069084103, "rouge2_recall": 0.05433482039565291, "rouge2_recall_stderr": 0.0023105213250519683, "rougeL_fmeasure": 0.12975792447787388, "rougeL_fmeasure_stderr": 0.0017033041521276769, "rougeL_precision": 0.09164613863479107, "rougeL_precision_stderr": 0.0012536491969391975, "rougeL_recall": 0.23162945469437568, "rougeL_recall_stderr": 0.003070926511833877, "rougeLsum_fmeasure": 0.1335903464477578, "rougeLsum_fmeasure_stderr": 0.0018357495291236787, "rougeLsum_precision": 0.09433314094899128, "rougeLsum_precision_stderr": 0.001348427130905447, "rougeLsum_recall": 0.2385512346805842, "rougeLsum_recall_stderr": 0.0032940278213274467}}, "3": {"article_DOC_summary": {"bleu": 1.2211825031010066, "bleu_stderr": 0.09154640955268176, "rouge1_fmeasure": 0.15920600367020823, "rouge1_fmeasure_stderr": 0.002481979093247659, "rouge1_precision": 0.11565156919649207, "rouge1_precision_stderr": 0.0019801993699536105, "rouge1_recall": 0.2756237703410958, "rouge1_recall_stderr": 0.004182876892512918, "rouge2_fmeasure": 0.02850791752436153, "rouge2_fmeasure_stderr": 0.0012936908884457888, "rouge2_precision": 0.020336317588370855, "rouge2_precision_stderr": 0.0009274120929811008, "rouge2_recall": 0.05057209548284876, "rouge2_recall_stderr": 0.002351218721951043, "rougeL_fmeasure": 0.12499127552831453, "rougeL_fmeasure_stderr": 0.0018804015974190902, "rougeL_precision": 0.09058001935560847, "rougeL_precision_stderr": 0.0014865597899847306, "rougeL_recall": 0.21793077385082366, "rougeL_recall_stderr": 0.003303059349423484, "rougeLsum_fmeasure": 0.1277963893933807, "rougeLsum_fmeasure_stderr": 0.0020055497561999075, "rougeLsum_precision": 0.09255803892135304, "rougeLsum_precision_stderr": 0.0015654263100926082, "rougeLsum_recall": 0.2227941148100558, "rougeLsum_recall_stderr": 0.0035075623884690582}}, "4": {"article_DOC_summary": {"bleu": 0.6391086739448858, "bleu_stderr": 0.1700476603076541, "rouge1_fmeasure": 0.0422631600216737, "rouge1_fmeasure_stderr": 0.0023710097459347006, "rouge1_precision": 0.03604653885002255, "rouge1_precision_stderr": 0.002296117864033233, "rouge1_recall": 0.06660229202514774, "rouge1_recall_stderr": 0.0038110111174951616, "rouge2_fmeasure": 0.006851248257616251, "rouge2_fmeasure_stderr": 0.0007463712089541968, "rouge2_precision": 0.0059458501046395435, "rouge2_precision_stderr": 0.0008340086797958044, "rouge2_recall": 0.011405807073573361, "rouge2_recall_stderr": 0.001256767237688611, "rougeL_fmeasure": 0.03322511331159771, "rougeL_fmeasure_stderr": 0.001836464148903972, "rougeL_precision": 0.029041397090136103, "rougeL_precision_stderr": 0.0019748578000551753, "rougeL_recall": 0.0526320135913171, "rougeL_recall_stderr": 0.0030189987868325676, "rougeLsum_fmeasure": 0.035345564934779486, "rougeLsum_fmeasure_stderr": 0.0019873313312401414, "rougeLsum_precision": 0.03074792573874264, "rougeLsum_precision_stderr": 0.0020676689105202572, "rougeLsum_recall": 0.055745471113257235, "rougeLsum_recall_stderr": 0.0032243196958437617}}, "5": {"article_DOC_summary": {"bleu": 1.0632885974270816e-38, "bleu_stderr": 2.8992106826594798e-33, "rouge1_fmeasure": 0.002353715811942938, "rouge1_fmeasure_stderr": 0.0006677511768602319, "rouge1_precision": 0.002662477701177392, "rouge1_precision_stderr": 0.0007829468225788233, "rouge1_recall": 0.0021639115396153217, "rouge1_recall_stderr": 0.0006001713237417921, "rouge2_fmeasure": 0.0002533740120631878, "rouge2_fmeasure_stderr": 0.00011551517378567248, "rouge2_precision": 0.000298209621184094, "rouge2_precision_stderr": 0.00013919588909238172, "rouge2_recall": 0.00022427781861744122, "rouge2_recall_stderr": 0.00010143261738415956, "rougeL_fmeasure": 0.0018075801702424836, "rougeL_fmeasure_stderr": 0.0004947733027310591, "rougeL_precision": 0.00201404952668169, "rougeL_precision_stderr": 0.0005627565099830248, "rougeL_recall": 0.0016845401365164876, "rougeL_recall_stderr": 0.0004566242212887019, "rougeLsum_fmeasure": 0.0018029021724255492, "rougeLsum_fmeasure_stderr": 0.0004971162944888186, "rougeLsum_precision": 0.002011541828046881, "rougeLsum_precision_stderr": 0.0005685335357244061, "rougeLsum_recall": 0.0016792198329612477, "rougeLsum_recall_stderr": 0.0004575199206063041}}}}
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3640781107267224, "bleu_stderr": 0.035526130232376905, "rouge1_fmeasure": 0.10446452400904378, "rouge1_fmeasure_stderr": 0.0021282046267782563, "rouge1_precision": 0.07014102967481332, "rouge1_precision_stderr": 0.0017603361253258872, "rouge1_recall": 0.28842029546681935, "rouge1_recall_stderr": 0.0049233552951936715, "rouge2_fmeasure": 0.0493613190828951, "rouge2_fmeasure_stderr": 0.0012678462201386074, "rouge2_precision": 0.0330408702642744, "rouge2_precision_stderr": 0.0010389544177208019, "rouge2_recall": 0.14098272377215726, "rouge2_recall_stderr": 0.003297340089435132, "rougeL_fmeasure": 0.10036321845998371, "rougeL_fmeasure_stderr": 0.0019766166905939206, "rougeL_precision": 0.06704364750637273, "rougeL_precision_stderr": 0.0016022359995174224, "rougeL_recall": 0.2800801497048708, "rougeL_recall_stderr": 0.004801675913591644, "rougeLsum_fmeasure": 0.09950407096550104, "rougeLsum_fmeasure_stderr": 0.001998952718407562, "rougeLsum_precision": 0.06676887269215705, "rougeLsum_precision_stderr": 0.0016525447643874539, "rougeLsum_recall": 0.2753731807713974, "rougeLsum_recall_stderr": 0.004658503872997521}}, "1": {"PALM_prompt": {"bleu": 0.43814070516160336, "bleu_stderr": 0.031903732201215625, "rouge1_fmeasure": 0.11401542749060949, "rouge1_fmeasure_stderr": 0.001998418757005679, "rouge1_precision": 0.07439507943093727, "rouge1_precision_stderr": 0.0016461947544769882, "rouge1_recall": 0.3627037154692145, "rouge1_recall_stderr": 0.005346241816796717, "rouge2_fmeasure": 0.05246738310334436, "rouge2_fmeasure_stderr": 0.0012383107743957698, "rouge2_precision": 0.03450570216297924, "rouge2_precision_stderr": 0.0010840106357965823, "rouge2_recall": 0.17166181107742665, "rouge2_recall_stderr": 0.0035320509500018442, "rougeL_fmeasure": 0.10684176105876576, "rougeL_fmeasure_stderr": 0.0017684199830638747, "rougeL_precision": 0.06956336032839978, "rougeL_precision_stderr": 0.0014791770438323157, "rougeL_recall": 0.34089838993238214, "rougeL_recall_stderr": 0.004895274848244494, "rougeLsum_fmeasure": 0.10858103762275753, "rougeLsum_fmeasure_stderr": 0.0018743087871579852, "rougeLsum_precision": 0.07090635820062817, "rougeLsum_precision_stderr": 0.0015619003235327967, "rougeLsum_recall": 0.3436647520266887, "rougeLsum_recall_stderr": 0.004921959631428134}}, "2": {"PALM_prompt": {"bleu": 0.5028711859163484, "bleu_stderr": 0.030787719545950616, "rouge1_fmeasure": 0.12054285310561563, "rouge1_fmeasure_stderr": 0.001972649226196246, "rouge1_precision": 0.07753757605359372, "rouge1_precision_stderr": 0.0015512644005857684, "rouge1_recall": 0.3860697308960553, "rouge1_recall_stderr": 0.005258277069941139, "rouge2_fmeasure": 0.05603019582407278, "rouge2_fmeasure_stderr": 0.0012315350771190166, "rouge2_precision": 0.036110775377632104, "rouge2_precision_stderr": 0.0010270494416056552, "rouge2_recall": 0.18882466978964585, "rouge2_recall_stderr": 0.00369639024938518, "rougeL_fmeasure": 0.11149048519432699, "rougeL_fmeasure_stderr": 0.00172065437166548, "rougeL_precision": 0.07153591340290212, "rougeL_precision_stderr": 0.0013649508297771586, "rougeL_recall": 0.3587064591062304, "rougeL_recall_stderr": 0.004743866412415114, "rougeLsum_fmeasure": 0.11463488060551481, "rougeLsum_fmeasure_stderr": 0.0018472065953168712, "rougeLsum_precision": 0.07375998187509147, "rougeLsum_precision_stderr": 0.0014640602061033832, "rougeLsum_recall": 0.366740909986174, "rougeLsum_recall_stderr": 0.0049215861808266865}}, "3": {"PALM_prompt": {"bleu": 0.5587796074505347, "bleu_stderr": 0.028253384481976996, "rouge1_fmeasure": 0.12013288464252138, "rouge1_fmeasure_stderr": 0.0019596437037062084, "rouge1_precision": 0.07698024296313602, "rouge1_precision_stderr": 0.0015328877673257805, "rouge1_recall": 0.3879851341856111, "rouge1_recall_stderr": 0.005282892413105376, "rouge2_fmeasure": 0.05577366815646396, "rouge2_fmeasure_stderr": 0.0012308874521381001, "rouge2_precision": 0.03563480049437639, "rouge2_precision_stderr": 0.0009489783954054023, "rouge2_recall": 0.19142284020103045, "rouge2_recall_stderr": 0.0037479421157118227, "rougeL_fmeasure": 0.11072845161345761, "rougeL_fmeasure_stderr": 0.0017220823748979428, "rougeL_precision": 0.07087425427017371, "rougeL_precision_stderr": 0.0013659902462074293, "rougeL_recall": 0.3584265264233966, "rougeL_recall_stderr": 0.004773698317842305, "rougeLsum_fmeasure": 0.11366561003670539, "rougeLsum_fmeasure_stderr": 0.0018354175463387527, "rougeLsum_precision": 0.07287440760306794, "rougeLsum_precision_stderr": 0.0014495161839448622, "rougeLsum_recall": 0.3664813728952017, "rougeLsum_recall_stderr": 0.004893732225893309}}, "4": {"PALM_prompt": {"bleu": 0.5825000714818155, "bleu_stderr": 0.04034581814245941, "rouge1_fmeasure": 0.12163862059782181, "rouge1_fmeasure_stderr": 0.0018761748416100347, "rouge1_precision": 0.07734202326611457, "rouge1_precision_stderr": 0.0014026697555017797, "rouge1_recall": 0.3934515503714071, "rouge1_recall_stderr": 0.005116597372321258, "rouge2_fmeasure": 0.05615553678250917, "rouge2_fmeasure_stderr": 0.0011765218173084135, "rouge2_precision": 0.03554662142275366, "rouge2_precision_stderr": 0.0008428630445647761, "rouge2_recall": 0.19226980177347533, "rouge2_recall_stderr": 0.0036534155547787617, "rougeL_fmeasure": 0.1118475941715947, "rougeL_fmeasure_stderr": 0.0016382655449843741, "rougeL_precision": 0.07104086842884898, "rougeL_precision_stderr": 0.0012211438709472402, "rougeL_recall": 0.36224536051274814, "rougeL_recall_stderr": 0.004588128068913386, "rougeLsum_fmeasure": 0.11526066260453778, "rougeLsum_fmeasure_stderr": 0.0017513742375990034, "rougeLsum_precision": 0.07329972341796974, "rougeLsum_precision_stderr": 0.0013094822661880972, "rougeLsum_recall": 0.3725146733994017, "rougeLsum_recall_stderr": 0.004743981973403336}}, "5": {"PALM_prompt": {"bleu": 0.6464476979550067, "bleu_stderr": 0.03340569659922027, "rouge1_fmeasure": 0.12087885693768408, "rouge1_fmeasure_stderr": 0.0018438017181407697, "rouge1_precision": 0.07640911074774853, "rouge1_precision_stderr": 0.0013513218301346682, "rouge1_recall": 0.39988107141167023, "rouge1_recall_stderr": 0.005297630864733688, "rouge2_fmeasure": 0.056241483469904825, "rouge2_fmeasure_stderr": 0.0011583569468954413, "rouge2_precision": 0.035338200721478794, "rouge2_precision_stderr": 0.0008181786430054925, "rouge2_recall": 0.19972811175935426, "rouge2_recall_stderr": 0.00378295966189437, "rougeL_fmeasure": 0.1110071767401602, "rougeL_fmeasure_stderr": 0.0016174705062074043, "rougeL_precision": 0.07014027016133682, "rougeL_precision_stderr": 0.0011852373127254551, "rougeL_recall": 0.3675015837582238, "rougeL_recall_stderr": 0.004706898670250232, "rougeLsum_fmeasure": 0.1140558109899182, "rougeLsum_fmeasure_stderr": 0.0017170337736459903, "rougeLsum_precision": 0.07214817551236571, "rougeLsum_precision_stderr": 0.0012624111393430876, "rougeLsum_recall": 0.3762402229794878, "rougeLsum_recall_stderr": 0.004831842536811778}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4785870749297974, "bleu_stderr": 0.058287705839301554, "rouge1_fmeasure": 0.17553793244289345, "rouge1_fmeasure_stderr": 0.001806401016559939, "rouge1_precision": 0.15044351613730478, "rouge1_precision_stderr": 0.001873937661358226, "rouge1_recall": 0.2541675583808366, "rouge1_recall_stderr": 0.002525767681285887, "rouge2_fmeasure": 0.03364466039752911, "rouge2_fmeasure_stderr": 0.0008133761132820285, "rouge2_precision": 0.028754049926889878, "rouge2_precision_stderr": 0.0007284335979063377, "rouge2_recall": 0.0498554560379571, "rouge2_recall_stderr": 0.0012994427687278028, "rougeL_fmeasure": 0.1387301222348297, "rougeL_fmeasure_stderr": 0.0012940665219750957, "rougeL_precision": 0.11754314152860335, "rougeL_precision_stderr": 0.0013223617006764188, "rougeL_recall": 0.20544370534958584, "rougeL_recall_stderr": 0.0020272468710765853, "rougeLsum_fmeasure": 0.16126585981961294, "rougeLsum_fmeasure_stderr": 0.0016467066936214445, "rougeLsum_precision": 0.13796688806438603, "rougeLsum_precision_stderr": 0.0017060133557980854, "rougeLsum_recall": 0.23453206833843981, "rougeLsum_recall_stderr": 0.002342224647799962}}, "1": {"tldr_en": {"bleu": 2.0467243985974637, "bleu_stderr": 0.07716877063896437, "rouge1_fmeasure": 0.1921501387436706, "rouge1_fmeasure_stderr": 0.00188282133972654, "rouge1_precision": 0.16583754031020287, "rouge1_precision_stderr": 0.0020211759897587646, "rouge1_recall": 0.2784359478170013, "rouge1_recall_stderr": 0.002692298233698752, "rouge2_fmeasure": 0.04146537149742157, "rouge2_fmeasure_stderr": 0.0008844838235072672, "rouge2_precision": 0.035939445160926, "rouge2_precision_stderr": 0.0008394970960275082, "rouge2_recall": 0.06220805715536351, "rouge2_recall_stderr": 0.0015155473184693795, "rougeL_fmeasure": 0.14062569600905112, "rougeL_fmeasure_stderr": 0.0012672119441252353, "rougeL_precision": 0.12014085192612638, "rougeL_precision_stderr": 0.0013614505956239505, "rougeL_recall": 0.2086744134191942, "rougeL_recall_stderr": 0.0020828906859784203, "rougeLsum_fmeasure": 0.179434656949378, "rougeLsum_fmeasure_stderr": 0.0017505927676760471, "rougeLsum_precision": 0.15464361675465313, "rougeLsum_precision_stderr": 0.001877438093158986, "rougeLsum_recall": 0.26102453302903145, "rougeLsum_recall_stderr": 0.0025492852995640664}}, "2": {"tldr_en": {"bleu": 2.320060128732629, "bleu_stderr": 0.08893019700806855, "rouge1_fmeasure": 0.19924741991114028, "rouge1_fmeasure_stderr": 0.0018379057773677573, "rouge1_precision": 0.1726215876854994, "rouge1_precision_stderr": 0.0020260105444747777, "rouge1_recall": 0.28926322433345203, "rouge1_recall_stderr": 0.0026664689340195023, "rouge2_fmeasure": 0.04476923922576134, "rouge2_fmeasure_stderr": 0.0009302798543204191, "rouge2_precision": 0.03857648929076035, "rouge2_precision_stderr": 0.0008639438064629973, "rouge2_recall": 0.0668419265131429, "rouge2_recall_stderr": 0.0015443469317622542, "rougeL_fmeasure": 0.14481404045263865, "rougeL_fmeasure_stderr": 0.0012560984379408465, "rougeL_precision": 0.12435019496309772, "rougeL_precision_stderr": 0.001371296770680872, "rougeL_recall": 0.21491826866017136, "rougeL_recall_stderr": 0.0020968332839467643, "rougeLsum_fmeasure": 0.18685212508946245, "rougeLsum_fmeasure_stderr": 0.0017201713555341734, "rougeLsum_precision": 0.161732111269423, "rougeLsum_precision_stderr": 0.0019007468899272617, "rougeLsum_recall": 0.2721000571947204, "rougeLsum_recall_stderr": 0.0025317136585981882}}, "3": {"tldr_en": {"bleu": 2.293615568382501, "bleu_stderr": 0.08330025806736717, "rouge1_fmeasure": 0.16846815995128117, "rouge1_fmeasure_stderr": 0.002131073495814494, "rouge1_precision": 0.15279328288706334, "rouge1_precision_stderr": 0.002367394441169836, "rouge1_recall": 0.24247674716724685, "rouge1_recall_stderr": 0.0031392522569694873, "rouge2_fmeasure": 0.03767208670961266, "rouge2_fmeasure_stderr": 0.0009011570298632758, "rouge2_precision": 0.03377044280894147, "rouge2_precision_stderr": 0.0009626959031858541, "rouge2_recall": 0.0561642215978211, "rouge2_recall_stderr": 0.0014849352445162754, "rougeL_fmeasure": 0.12239731708708615, "rougeL_fmeasure_stderr": 0.0014791592721134755, "rougeL_precision": 0.11083771641321827, "rougeL_precision_stderr": 0.001737490941024412, "rougeL_recall": 0.1801593730396781, "rougeL_recall_stderr": 0.0024158011806008285, "rougeLsum_fmeasure": 0.1577926976664846, "rougeLsum_fmeasure_stderr": 0.0019933237012214927, "rougeLsum_precision": 0.14303381078964403, "rougeLsum_precision_stderr": 0.0022285216909996414, "rougeLsum_recall": 0.2278951107867257, "rougeLsum_recall_stderr": 0.002975328208810723}}, "4": {"tldr_en": {"bleu": 0.5400646892787335, "bleu_stderr": 0.037286536411670276, "rouge1_fmeasure": 0.053580267784491944, "rouge1_fmeasure_stderr": 0.001838419022560825, "rouge1_precision": 0.05052662497660933, "rouge1_precision_stderr": 0.001976851052105952, "rouge1_recall": 0.07915452508070452, "rouge1_recall_stderr": 0.0027320931959636174, "rouge2_fmeasure": 0.012154113417992855, "rouge2_fmeasure_stderr": 0.0006275897699311847, "rouge2_precision": 0.011800340351831824, "rouge2_precision_stderr": 0.0008275418126856583, "rouge2_recall": 0.0182851744036474, "rouge2_recall_stderr": 0.0009958545079097947, "rougeL_fmeasure": 0.03985392369168753, "rougeL_fmeasure_stderr": 0.0013453967795167268, "rougeL_precision": 0.03782568360810854, "rougeL_precision_stderr": 0.0015222964082526145, "rougeL_recall": 0.060145492850265714, "rougeL_recall_stderr": 0.0020989219258759654, "rougeLsum_fmeasure": 0.050444560481311525, "rougeLsum_fmeasure_stderr": 0.0017303585091702936, "rougeLsum_precision": 0.04761426816521709, "rougeLsum_precision_stderr": 0.0018699392046644455, "rougeLsum_recall": 0.07466481391563054, "rougeLsum_recall_stderr": 0.0025793584133793814}}, "5": {"tldr_en": {"bleu": 8.569548966549661e-07, "bleu_stderr": 1.7490844062553856e-06, "rouge1_fmeasure": 0.008446833457743878, "rouge1_fmeasure_stderr": 0.0008128258304362509, "rouge1_precision": 0.007731794427912367, "rouge1_precision_stderr": 0.0007951245840495542, "rouge1_recall": 0.012761577644877838, "rouge1_recall_stderr": 0.0012392730943071788, "rouge2_fmeasure": 0.0020072557460750727, "rouge2_fmeasure_stderr": 0.00026547384624794787, "rouge2_precision": 0.0017913532527059574, "rouge2_precision_stderr": 0.00027132139800406437, "rouge2_recall": 0.003263180278324118, "rouge2_recall_stderr": 0.00046230831336980995, "rougeL_fmeasure": 0.0063490465986384935, "rougeL_fmeasure_stderr": 0.0006115233301676097, "rougeL_precision": 0.005821248286849475, "rougeL_precision_stderr": 0.000606460766053927, "rougeL_recall": 0.009782196731976658, "rougeL_recall_stderr": 0.0009702372753528633, "rougeLsum_fmeasure": 0.007847265251294862, "rougeLsum_fmeasure_stderr": 0.0007493677001603345, "rougeLsum_precision": 0.0071878578626772376, "rougeLsum_precision_stderr": 0.0007402306641308863, "rougeLsum_recall": 0.011960018902941532, "rougeLsum_recall_stderr": 0.001162152856806128}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.0956694980647792, "bleu_stderr": 0.04999557005701438, "rouge1_fmeasure": 0.09170375521862002, "rouge1_fmeasure_stderr": 0.0016027220200337815, "rouge1_precision": 0.09503245055218068, "rouge1_precision_stderr": 0.002227002440189141, "rouge1_recall": 0.12519204548417082, "rouge1_recall_stderr": 0.002257706262720428, "rouge2_fmeasure": 0.0200914197255096, "rouge2_fmeasure_stderr": 0.0007228266926632685, "rouge2_precision": 0.015747413529649493, "rouge2_precision_stderr": 0.0005857234957460363, "rouge2_recall": 0.03085419325647102, "rouge2_recall_stderr": 0.0011016936972703011, "rougeL_fmeasure": 0.08870793127413534, "rougeL_fmeasure_stderr": 0.0015148695280122193, "rougeL_precision": 0.09097585318147142, "rougeL_precision_stderr": 0.0020889683329724184, "rougeL_recall": 0.12191519471001734, "rougeL_recall_stderr": 0.002171780673661775, "rougeLsum_fmeasure": 0.0784374812184608, "rougeLsum_fmeasure_stderr": 0.0014050281578343764, "rougeLsum_precision": 0.0834425343644634, "rougeLsum_precision_stderr": 0.0020931903981870545, "rougeLsum_recall": 0.10608182715305749, "rougeLsum_recall_stderr": 0.0019274028911518155}}, "1": {"generate_text_restaurant": {"bleu": 10.151239164071983, "bleu_stderr": 0.1257334156643682, "rouge1_fmeasure": 0.40675497527393745, "rouge1_fmeasure_stderr": 0.002120685128829797, "rouge1_precision": 0.4698413043094089, "rouge1_precision_stderr": 0.0030273225025413382, "rouge1_recall": 0.40584950166038797, "rouge1_recall_stderr": 0.0029229476256733026, "rouge2_fmeasure": 0.17669097143978887, "rouge2_fmeasure_stderr": 0.001716223261618654, "rouge2_precision": 0.2074578784589061, "rouge2_precision_stderr": 0.0023047939137167727, "rouge2_recall": 0.17636894739489137, "rouge2_recall_stderr": 0.001969048658306312, "rougeL_fmeasure": 0.2915979204699439, "rougeL_fmeasure_stderr": 0.0017827978723647846, "rougeL_precision": 0.3404061024475899, "rougeL_precision_stderr": 0.0026532569303317583, "rougeL_recall": 0.2896879499504477, "rougeL_recall_stderr": 0.0022806434671987395, "rougeLsum_fmeasure": 0.33318221999557907, "rougeLsum_fmeasure_stderr": 0.0020291648036479103, "rougeLsum_precision": 0.3852954389535586, "rougeLsum_precision_stderr": 0.0028137809730615915, "rougeLsum_recall": 0.3327977299709147, "rougeLsum_recall_stderr": 0.002657870623602599}}, "2": {"generate_text_restaurant": {"bleu": 10.477786265685717, "bleu_stderr": 0.18647473053208208, "rouge1_fmeasure": 0.4181182679230384, "rouge1_fmeasure_stderr": 0.0020452985763851753, "rouge1_precision": 0.4750249856912901, "rouge1_precision_stderr": 0.003117749277972381, "rouge1_recall": 0.420167230887751, "rouge1_recall_stderr": 0.00277185209174179, "rouge2_fmeasure": 0.19044056117286426, "rouge2_fmeasure_stderr": 0.00168575182239935, "rouge2_precision": 0.22050994133149254, "rouge2_precision_stderr": 0.0023245851449860567, "rouge2_recall": 0.19130121956469073, "rouge2_recall_stderr": 0.001942626358068861, "rougeL_fmeasure": 0.2982970663330926, "rougeL_fmeasure_stderr": 0.0017726787771058774, "rougeL_precision": 0.3406569558070198, "rougeL_precision_stderr": 0.0026479989912893075, "rougeL_recall": 0.299702139963536, "rougeL_recall_stderr": 0.0022640373889766096, "rougeLsum_fmeasure": 0.34419023746163807, "rougeLsum_fmeasure_stderr": 0.0019847110018846874, "rougeLsum_precision": 0.3901601617973346, "rougeLsum_precision_stderr": 0.002815160547643611, "rougeLsum_recall": 0.3471499862247708, "rougeLsum_recall_stderr": 0.0026065622780937396}}, "3": {"generate_text_restaurant": {"bleu": 10.568317116179442, "bleu_stderr": 0.11556927070417884, "rouge1_fmeasure": 0.41842021335444884, "rouge1_fmeasure_stderr": 0.0020188593584297234, "rouge1_precision": 0.47159606393712117, "rouge1_precision_stderr": 0.003129868683115526, "rouge1_recall": 0.4234823828365637, "rouge1_recall_stderr": 0.0027467329857418952, "rouge2_fmeasure": 0.1938496862679729, "rouge2_fmeasure_stderr": 0.0016754810953723432, "rouge2_precision": 0.22231457931781198, "rouge2_precision_stderr": 0.0023199388284090745, "rouge2_recall": 0.19702786066961364, "rouge2_recall_stderr": 0.001990057035725131, "rougeL_fmeasure": 0.29696016514568546, "rougeL_fmeasure_stderr": 0.0017603697168916232, "rougeL_precision": 0.33656147022200217, "rougeL_precision_stderr": 0.002667737808938232, "rougeL_recall": 0.30066480270962387, "rougeL_recall_stderr": 0.0022622576572450003, "rougeLsum_fmeasure": 0.3438826400107135, "rougeLsum_fmeasure_stderr": 0.0019856692684705, "rougeLsum_precision": 0.38705644076443046, "rougeLsum_precision_stderr": 0.0028690431456947075, "rougeLsum_recall": 0.349206081387889, "rougeLsum_recall_stderr": 0.002597394859544418}}, "4": {"generate_text_restaurant": {"bleu": 10.581390958595872, "bleu_stderr": 0.16287225143334053, "rouge1_fmeasure": 0.4202226303244981, "rouge1_fmeasure_stderr": 0.0019805005077845462, "rouge1_precision": 0.46703203534176785, "rouge1_precision_stderr": 0.003027608948766856, "rouge1_recall": 0.42673885443033144, "rouge1_recall_stderr": 0.002653056579198862, "rouge2_fmeasure": 0.19550059496111732, "rouge2_fmeasure_stderr": 0.001681137850682549, "rouge2_precision": 0.221024427493211, "rouge2_precision_stderr": 0.0022668760796123264, "rouge2_recall": 0.19893689501234513, "rouge2_recall_stderr": 0.0019645186911920118, "rougeL_fmeasure": 0.2960631555841986, "rougeL_fmeasure_stderr": 0.00175639502051002, "rougeL_precision": 0.3305021376136181, "rougeL_precision_stderr": 0.0025711911253961032, "rougeL_recall": 0.30080598135342596, "rougeL_recall_stderr": 0.002216619700518367, "rougeLsum_fmeasure": 0.34711141302754794, "rougeLsum_fmeasure_stderr": 0.0019655722013055365, "rougeLsum_precision": 0.3848753584944118, "rougeLsum_precision_stderr": 0.0027594334460907116, "rougeLsum_recall": 0.35379992432844626, "rougeLsum_recall_stderr": 0.002554582589973131}}, "5": {"generate_text_restaurant": {"bleu": 10.161698634389158, "bleu_stderr": 0.13251687133818035, "rouge1_fmeasure": 0.41690846563222644, "rouge1_fmeasure_stderr": 0.0019329282633560236, "rouge1_precision": 0.45350802506846094, "rouge1_precision_stderr": 0.0029332747892263492, "rouge1_recall": 0.4298932002561076, "rouge1_recall_stderr": 0.0025654012473086065, "rouge2_fmeasure": 0.19180440349615147, "rouge2_fmeasure_stderr": 0.0016038971120681744, "rouge2_precision": 0.21155169806401755, "rouge2_precision_stderr": 0.0021213554539015983, "rouge2_recall": 0.19815792477697378, "rouge2_recall_stderr": 0.0018697079815536174, "rougeL_fmeasure": 0.2943936910199933, "rougeL_fmeasure_stderr": 0.001703424504126945, "rougeL_precision": 0.3208681338440674, "rougeL_precision_stderr": 0.0024465770749067034, "rougeL_recall": 0.3044498221521836, "rougeL_recall_stderr": 0.0021841272279665724, "rougeLsum_fmeasure": 0.3474979202384499, "rougeLsum_fmeasure_stderr": 0.0019204154404832128, "rougeLsum_precision": 0.37693689810608394, "rougeLsum_precision_stderr": 0.0026776647538697042, "rougeLsum_recall": 0.3598773179164774, "rougeLsum_recall_stderr": 0.002490089475383021}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8522377895450308, "bleu_stderr": 0.07723955076500588, "rouge1_fmeasure": 0.20202631280258004, "rouge1_fmeasure_stderr": 0.0024569661256867497, "rouge1_precision": 0.1530345030359518, "rouge1_precision_stderr": 0.002115037282090291, "rouge1_recall": 0.3297522507077294, "rouge1_recall_stderr": 0.004233264868359823, "rouge2_fmeasure": 0.0438628207106013, "rouge2_fmeasure_stderr": 0.0015033748211488044, "rouge2_precision": 0.03260120789118233, "rouge2_precision_stderr": 0.0011626221940092174, "rouge2_recall": 0.07472054203573325, "rouge2_recall_stderr": 0.002669966683230418, "rougeL_fmeasure": 0.15285396489063718, "rougeL_fmeasure_stderr": 0.0018457457908837322, "rougeL_precision": 0.1154609949889405, "rougeL_precision_stderr": 0.001570276581336417, "rougeL_recall": 0.2514593427654629, "rougeL_recall_stderr": 0.003335220546206934, "rougeLsum_fmeasure": 0.15642992761683933, "rougeLsum_fmeasure_stderr": 0.0020624169809871596, "rougeLsum_precision": 0.11796967069833372, "rougeLsum_precision_stderr": 0.0016923604745038886, "rougeLsum_recall": 0.2574959975250046, "rougeLsum_recall_stderr": 0.0037084065360991183}}, "1": {"article_DOC_summary": {"bleu": 1.2604705578203963, "bleu_stderr": 0.06124796466601407, "rouge1_fmeasure": 0.16964759605780336, "rouge1_fmeasure_stderr": 0.002326270519236544, "rouge1_precision": 0.12043203461580464, "rouge1_precision_stderr": 0.0017443586213575642, "rouge1_recall": 0.2990330815832083, "rouge1_recall_stderr": 0.00392004786403155, "rouge2_fmeasure": 0.03145088602415999, "rouge2_fmeasure_stderr": 0.001268842216195083, "rouge2_precision": 0.022185940713628696, "rouge2_precision_stderr": 0.0009035076589227027, "rouge2_recall": 0.05647340976722095, "rouge2_recall_stderr": 0.0022888090744372525, "rougeL_fmeasure": 0.13279585103698274, "rougeL_fmeasure_stderr": 0.0017386360127803384, "rougeL_precision": 0.0940358491286338, "rougeL_precision_stderr": 0.0012896725062631583, "rougeL_recall": 0.23588081649114553, "rougeL_recall_stderr": 0.003076131089195475, "rougeLsum_fmeasure": 0.13714896670693033, "rougeLsum_fmeasure_stderr": 0.001891141826576318, "rougeLsum_precision": 0.09715832932274611, "rougeLsum_precision_stderr": 0.0014027160695998882, "rougeLsum_recall": 0.24318297005771927, "rougeLsum_recall_stderr": 0.0032866076983934975}}, "2": {"article_DOC_summary": {"bleu": 1.1993548757142343, "bleu_stderr": 0.07506740455601302, "rouge1_fmeasure": 0.16618737976479214, "rouge1_fmeasure_stderr": 0.002276816920530847, "rouge1_precision": 0.11769325953014931, "rouge1_precision_stderr": 0.0017015278474607515, "rouge1_recall": 0.29438013695551873, "rouge1_recall_stderr": 0.0038789493089544562, "rouge2_fmeasure": 0.029880831851623216, "rouge2_fmeasure_stderr": 0.00122944703122044, "rouge2_precision": 0.02096426606557719, "rouge2_precision_stderr": 0.0008665435069084103, "rouge2_recall": 0.05433482039565291, "rouge2_recall_stderr": 0.0023105213250519683, "rougeL_fmeasure": 0.12975792447787388, "rougeL_fmeasure_stderr": 0.0017033041521276769, "rougeL_precision": 0.09164613863479107, "rougeL_precision_stderr": 0.0012536491969391975, "rougeL_recall": 0.23162945469437568, "rougeL_recall_stderr": 0.003070926511833877, "rougeLsum_fmeasure": 0.1335903464477578, "rougeLsum_fmeasure_stderr": 0.0018357495291236787, "rougeLsum_precision": 0.09433314094899128, "rougeLsum_precision_stderr": 0.001348427130905447, "rougeLsum_recall": 0.2385512346805842, "rougeLsum_recall_stderr": 0.0032940278213274467}}, "3": {"article_DOC_summary": {"bleu": 1.2211825031010066, "bleu_stderr": 0.09154640955268176, "rouge1_fmeasure": 0.15920600367020823, "rouge1_fmeasure_stderr": 0.002481979093247659, "rouge1_precision": 0.11565156919649207, "rouge1_precision_stderr": 0.0019801993699536105, "rouge1_recall": 0.2756237703410958, "rouge1_recall_stderr": 0.004182876892512918, "rouge2_fmeasure": 0.02850791752436153, "rouge2_fmeasure_stderr": 0.0012936908884457888, "rouge2_precision": 0.020336317588370855, "rouge2_precision_stderr": 0.0009274120929811008, "rouge2_recall": 0.05057209548284876, "rouge2_recall_stderr": 0.002351218721951043, "rougeL_fmeasure": 0.12499127552831453, "rougeL_fmeasure_stderr": 0.0018804015974190902, "rougeL_precision": 0.09058001935560847, "rougeL_precision_stderr": 0.0014865597899847306, "rougeL_recall": 0.21793077385082366, "rougeL_recall_stderr": 0.003303059349423484, "rougeLsum_fmeasure": 0.1277963893933807, "rougeLsum_fmeasure_stderr": 0.0020055497561999075, "rougeLsum_precision": 0.09255803892135304, "rougeLsum_precision_stderr": 0.0015654263100926082, "rougeLsum_recall": 0.2227941148100558, "rougeLsum_recall_stderr": 0.0035075623884690582}}, "4": {"article_DOC_summary": {"bleu": 0.6391086739448858, "bleu_stderr": 0.1700476603076541, "rouge1_fmeasure": 0.0422631600216737, "rouge1_fmeasure_stderr": 0.0023710097459347006, "rouge1_precision": 0.03604653885002255, "rouge1_precision_stderr": 0.002296117864033233, "rouge1_recall": 0.06660229202514774, "rouge1_recall_stderr": 0.0038110111174951616, "rouge2_fmeasure": 0.006851248257616251, "rouge2_fmeasure_stderr": 0.0007463712089541968, "rouge2_precision": 0.0059458501046395435, "rouge2_precision_stderr": 0.0008340086797958044, "rouge2_recall": 0.011405807073573361, "rouge2_recall_stderr": 0.001256767237688611, "rougeL_fmeasure": 0.03322511331159771, "rougeL_fmeasure_stderr": 0.001836464148903972, "rougeL_precision": 0.029041397090136103, "rougeL_precision_stderr": 0.0019748578000551753, "rougeL_recall": 0.0526320135913171, "rougeL_recall_stderr": 0.0030189987868325676, "rougeLsum_fmeasure": 0.035345564934779486, "rougeLsum_fmeasure_stderr": 0.0019873313312401414, "rougeLsum_precision": 0.03074792573874264, "rougeLsum_precision_stderr": 0.0020676689105202572, "rougeLsum_recall": 0.055745471113257235, "rougeLsum_recall_stderr": 0.0032243196958437617}}, "5": {"article_DOC_summary": {"bleu": 1.0632885974270816e-38, "bleu_stderr": 2.8992106826594798e-33, "rouge1_fmeasure": 0.002353715811942938, "rouge1_fmeasure_stderr": 0.0006677511768602319, "rouge1_precision": 0.002662477701177392, "rouge1_precision_stderr": 0.0007829468225788233, "rouge1_recall": 0.0021639115396153217, "rouge1_recall_stderr": 0.0006001713237417921, "rouge2_fmeasure": 0.0002533740120631878, "rouge2_fmeasure_stderr": 0.00011551517378567248, "rouge2_precision": 0.000298209621184094, "rouge2_precision_stderr": 0.00013919588909238172, "rouge2_recall": 0.00022427781861744122, "rouge2_recall_stderr": 0.00010143261738415956, "rougeL_fmeasure": 0.0018075801702424836, "rougeL_fmeasure_stderr": 0.0004947733027310591, "rougeL_precision": 0.00201404952668169, "rougeL_precision_stderr": 0.0005627565099830248, "rougeL_recall": 0.0016845401365164876, "rougeL_recall_stderr": 0.0004566242212887019, "rougeLsum_fmeasure": 0.0018029021724255492, "rougeLsum_fmeasure_stderr": 0.0004971162944888186, "rougeLsum_precision": 0.002011541828046881, "rougeLsum_precision_stderr": 0.0005685335357244061, "rougeLsum_recall": 0.0016792198329612477, "rougeLsum_recall_stderr": 0.0004575199206063041}}}}
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_2.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.322,0.014782913600996666,0
3
  anli_r2,acc,0.319,0.014746404865473477,0
4
  anli_r3,acc,0.3333333333333333,0.013613950010225608,0
 
 
 
 
 
5
  cb,acc,0.4107142857142857,0.0663363415035954,1
6
  cb,f1,0.2374551971326165,,1
7
  copa,acc,0.77,0.04229525846816506,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.322,0.014782913600996666,0
3
  anli_r2,acc,0.319,0.014746404865473477,0
4
  anli_r3,acc,0.3333333333333333,0.013613950010225608,0
5
+ arc_challenge,acc,0.26535836177474403,0.012902554762313966,0
6
+ arc_challenge,acc_norm,0.3003412969283277,0.013395909309957004,0
7
+ arc_easy,acc,0.5921717171717171,0.010083950240041214,0
8
+ arc_easy,acc_norm,0.5635521885521886,0.010176569980111044,0
9
+ boolq,acc,0.5801223241590214,0.008632045504781747,1
10
  cb,acc,0.4107142857142857,0.0663363415035954,1
11
  cb,f1,0.2374551971326165,,1
12
  copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.42949611631149176,0.004939925958728874,0
14
+ hellaswag,acc_norm,0.5612427803226449,0.004952209831856574,0
15
+ piqa,acc,0.7372143634385201,0.010269354068140767,0
16
+ piqa,acc_norm,0.7404787812840044,0.010227939888173923,0
17
+ rte,acc,0.51985559566787,0.030072723167317184,0
18
+ sciq,acc,0.876,0.01042749887234397,0
19
+ sciq,acc_norm,0.852,0.011234866364235253,0
20
+ storycloze_2016,acc,0.6846606092998396,0.010744989116260668,0
21
+ winogrande,acc,0.5666929755327546,0.013926915052757342,0
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_2.json CHANGED
@@ -34,6 +34,38 @@
34
  "winogrande": {
35
  "acc": 0.5666929755327546,
36
  "acc_stderr": 0.013926915052757342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
  },
39
  "versions": {
@@ -44,6 +76,12 @@
44
  "copa": 0,
45
  "hellaswag": 0,
46
  "rte": 0,
47
- "winogrande": 0
 
 
 
 
 
 
48
  }
49
  }
 
34
  "winogrande": {
35
  "acc": 0.5666929755327546,
36
  "acc_stderr": 0.013926915052757342
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6846606092998396,
40
+ "acc_stderr": 0.010744989116260668
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5801223241590214,
44
+ "acc_stderr": 0.008632045504781747
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5921717171717171,
48
+ "acc_stderr": 0.010083950240041214,
49
+ "acc_norm": 0.5635521885521886,
50
+ "acc_norm_stderr": 0.010176569980111044
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26535836177474403,
54
+ "acc_stderr": 0.012902554762313966,
55
+ "acc_norm": 0.3003412969283277,
56
+ "acc_norm_stderr": 0.013395909309957004
57
+ },
58
+ "sciq": {
59
+ "acc": 0.876,
60
+ "acc_stderr": 0.01042749887234397,
61
+ "acc_norm": 0.852,
62
+ "acc_norm_stderr": 0.011234866364235253
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7372143634385201,
66
+ "acc_stderr": 0.010269354068140767,
67
+ "acc_norm": 0.7404787812840044,
68
+ "acc_norm_stderr": 0.010227939888173923
69
  }
70
  },
71
  "versions": {
 
76
  "copa": 0,
77
  "hellaswag": 0,
78
  "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_2_lm-eval_global_step52452_2023-02-25-10-40-28_2shots_backup.json DELETED
@@ -1,49 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.322,
5
- "acc_stderr": 0.014782913600996666
6
- },
7
- "anli_r2": {
8
- "acc": 0.319,
9
- "acc_stderr": 0.014746404865473477
10
- },
11
- "anli_r3": {
12
- "acc": 0.3333333333333333,
13
- "acc_stderr": 0.013613950010225608
14
- },
15
- "cb": {
16
- "acc": 0.4107142857142857,
17
- "acc_stderr": 0.0663363415035954,
18
- "f1": 0.2374551971326165
19
- },
20
- "copa": {
21
- "acc": 0.77,
22
- "acc_stderr": 0.04229525846816506
23
- },
24
- "hellaswag": {
25
- "acc": 0.42949611631149176,
26
- "acc_stderr": 0.004939925958728874,
27
- "acc_norm": 0.5612427803226449,
28
- "acc_norm_stderr": 0.004952209831856574
29
- },
30
- "rte": {
31
- "acc": 0.51985559566787,
32
- "acc_stderr": 0.030072723167317184
33
- },
34
- "winogrande": {
35
- "acc": 0.5666929755327546,
36
- "acc_stderr": 0.013926915052757342
37
- }
38
- },
39
- "versions": {
40
- "anli_r1": 0,
41
- "anli_r2": 0,
42
- "anli_r3": 0,
43
- "cb": 1,
44
- "copa": 0,
45
- "hellaswag": 0,
46
- "rte": 0,
47
- "winogrande": 0
48
- }
49
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_3.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.313,0.014671272822977888,0
3
  anli_r2,acc,0.335,0.014933117490932577,0
4
  anli_r3,acc,0.33,0.013579531277800922,0
 
 
 
 
 
5
  cb,acc,0.39285714285714285,0.0658538889806635,1
6
  cb,f1,0.29543049543049543,,1
7
  copa,acc,0.73,0.044619604333847394,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.313,0.014671272822977888,0
3
  anli_r2,acc,0.335,0.014933117490932577,0
4
  anli_r3,acc,0.33,0.013579531277800922,0
5
+ arc_challenge,acc,0.25853242320819114,0.012794553754288686,0
6
+ arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0
7
+ arc_easy,acc,0.6001683501683501,0.010051788039412918,0
8
+ arc_easy,acc_norm,0.5715488215488216,0.010154195733990965,0
9
+ boolq,acc,0.5758409785932722,0.008643869023388127,1
10
  cb,acc,0.39285714285714285,0.0658538889806635,1
11
  cb,f1,0.29543049543049543,,1
12
  copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.43228440549691294,0.004943809330692693,0
14
+ hellaswag,acc_norm,0.5619398526190001,0.004951346338164485,0
15
+ piqa,acc,0.7415669205658324,0.010213971636773326,0
16
+ piqa,acc_norm,0.7393906420021763,0.01024182615581163,0
17
+ rte,acc,0.5090252707581228,0.030091559826331334,0
18
+ sciq,acc,0.877,0.010391293421849876,0
19
+ sciq,acc_norm,0.864,0.01084535023047299,0
20
+ storycloze_2016,acc,0.6819882415820417,0.010769343495248537,0
21
+ winogrande,acc,0.5540647198105761,0.013970093482330699,0
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_3.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.73,
22
  "acc_stderr": 0.044619604333847394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.73,
22
  "acc_stderr": 0.044619604333847394
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.43228440549691294,
26
+ "acc_stderr": 0.004943809330692693,
27
+ "acc_norm": 0.5619398526190001,
28
+ "acc_norm_stderr": 0.004951346338164485
29
+ },
30
+ "rte": {
31
+ "acc": 0.5090252707581228,
32
+ "acc_stderr": 0.030091559826331334
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5540647198105761,
36
+ "acc_stderr": 0.013970093482330699
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6819882415820417,
40
+ "acc_stderr": 0.010769343495248537
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5758409785932722,
44
+ "acc_stderr": 0.008643869023388127
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.6001683501683501,
48
+ "acc_stderr": 0.010051788039412918,
49
+ "acc_norm": 0.5715488215488216,
50
+ "acc_norm_stderr": 0.010154195733990965
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.25853242320819114,
54
+ "acc_stderr": 0.012794553754288686,
55
+ "acc_norm": 0.2764505119453925,
56
+ "acc_norm_stderr": 0.013069662474252425
57
+ },
58
+ "sciq": {
59
+ "acc": 0.877,
60
+ "acc_stderr": 0.010391293421849876,
61
+ "acc_norm": 0.864,
62
+ "acc_norm_stderr": 0.01084535023047299
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7415669205658324,
66
+ "acc_stderr": 0.010213971636773326,
67
+ "acc_norm": 0.7393906420021763,
68
+ "acc_norm_stderr": 0.01024182615581163
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_4.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.346,0.015050266127564438,0
3
  anli_r2,acc,0.317,0.014721675438880227,0
4
  anli_r3,acc,0.33916666666666667,0.01367234349168182,0
 
 
 
 
 
5
  cb,acc,0.375,0.06527912098338669,1
6
  cb,f1,0.2047930283224401,,1
7
  copa,acc,0.76,0.04292346959909282,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.346,0.015050266127564438,0
3
  anli_r2,acc,0.317,0.014721675438880227,0
4
  anli_r3,acc,0.33916666666666667,0.01367234349168182,0
5
+ arc_challenge,acc,0.26706484641638223,0.01292893319649636,0
6
+ arc_challenge,acc_norm,0.2935153583617747,0.013307250444941117,0
7
+ arc_easy,acc,0.5976430976430976,0.010062244711011524,0
8
+ arc_easy,acc_norm,0.5736531986531986,0.010147858603835143,0
9
+ boolq,acc,0.5724770642201835,0.00865269299717733,1
10
  cb,acc,0.375,0.06527912098338669,1
11
  cb,f1,0.2047930283224401,,1
12
  copa,acc,0.76,0.04292346959909282,0
13
+ hellaswag,acc,0.4305915156343358,0.004941470620074857,0
14
+ hellaswag,acc_norm,0.5611431985660227,0.0049523323781203235,0
15
+ piqa,acc,0.7312295973884657,0.010343392940090011,0
16
+ piqa,acc_norm,0.735582154515778,0.010289787244767156,0
17
+ rte,acc,0.47653429602888087,0.030063300411902652,0
18
+ sciq,acc,0.883,0.010169287802713329,0
19
+ sciq,acc_norm,0.874,0.010499249222408054,0
20
+ storycloze_2016,acc,0.6916087653661144,0.010679734445487796,0
21
+ winogrande,acc,0.55327545382794,0.013972488371616696,0
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_4.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.04292346959909282
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.04292346959909282
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4305915156343358,
26
+ "acc_stderr": 0.004941470620074857,
27
+ "acc_norm": 0.5611431985660227,
28
+ "acc_norm_stderr": 0.0049523323781203235
29
+ },
30
+ "rte": {
31
+ "acc": 0.47653429602888087,
32
+ "acc_stderr": 0.030063300411902652
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.55327545382794,
36
+ "acc_stderr": 0.013972488371616696
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6916087653661144,
40
+ "acc_stderr": 0.010679734445487796
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5724770642201835,
44
+ "acc_stderr": 0.00865269299717733
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5976430976430976,
48
+ "acc_stderr": 0.010062244711011524,
49
+ "acc_norm": 0.5736531986531986,
50
+ "acc_norm_stderr": 0.010147858603835143
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26706484641638223,
54
+ "acc_stderr": 0.01292893319649636,
55
+ "acc_norm": 0.2935153583617747,
56
+ "acc_norm_stderr": 0.013307250444941117
57
+ },
58
+ "sciq": {
59
+ "acc": 0.883,
60
+ "acc_stderr": 0.010169287802713329,
61
+ "acc_norm": 0.874,
62
+ "acc_norm_stderr": 0.010499249222408054
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7312295973884657,
66
+ "acc_stderr": 0.010343392940090011,
67
+ "acc_norm": 0.735582154515778,
68
+ "acc_norm_stderr": 0.010289787244767156
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.325,0.014818724459095524,0
3
  anli_r2,acc,0.328,0.014853842487270336,0
4
  anli_r3,acc,0.3175,0.013443538681348054,0
 
 
 
 
 
5
  cb,acc,0.39285714285714285,0.0658538889806635,1
6
  cb,f1,0.22058422058422059,,1
7
  copa,acc,0.78,0.04163331998932262,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.325,0.014818724459095524,0
3
  anli_r2,acc,0.328,0.014853842487270336,0
4
  anli_r3,acc,0.3175,0.013443538681348054,0
5
+ arc_challenge,acc,0.2696245733788396,0.012968040686869152,0
6
+ arc_challenge,acc_norm,0.3003412969283277,0.013395909309956997,0
7
+ arc_easy,acc,0.5989057239057239,0.010057051106534367,0
8
+ arc_easy,acc_norm,0.5749158249158249,0.010143966195717844,0
9
+ boolq,acc,0.5737003058103975,0.00864953162580567,1
10
  cb,acc,0.39285714285714285,0.0658538889806635,1
11
  cb,f1,0.22058422058422059,,1
12
  copa,acc,0.78,0.04163331998932262,0
13
+ hellaswag,acc,0.43108942441744674,0.004942164585991471,0
14
+ hellaswag,acc_norm,0.5640310695080661,0.004948696280312417,0
15
+ piqa,acc,0.7225244831338411,0.010446818281039952,0
16
+ piqa,acc_norm,0.7393906420021763,0.010241826155811633,0
17
+ rte,acc,0.5379061371841155,0.03000984891252912,0
18
+ sciq,acc,0.893,0.009779910359847167,0
19
+ sciq,acc_norm,0.88,0.010281328012747377,0
20
+ storycloze_2016,acc,0.6862640299305185,0.010730179119317632,0
21
+ winogrande,acc,0.5619573796369376,0.013944181296470804,0
2b855b9bc4seed3/evaluation/rankeval/2b855b9bc4seed3_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.78,
22
  "acc_stderr": 0.04163331998932262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.78,
22
  "acc_stderr": 0.04163331998932262
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.43108942441744674,
26
+ "acc_stderr": 0.004942164585991471,
27
+ "acc_norm": 0.5640310695080661,
28
+ "acc_norm_stderr": 0.004948696280312417
29
+ },
30
+ "rte": {
31
+ "acc": 0.5379061371841155,
32
+ "acc_stderr": 0.03000984891252912
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5619573796369376,
36
+ "acc_stderr": 0.013944181296470804
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6862640299305185,
40
+ "acc_stderr": 0.010730179119317632
41
+ },
42
+ "boolq": {
43
+ "acc": 0.5737003058103975,
44
+ "acc_stderr": 0.00864953162580567
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5989057239057239,
48
+ "acc_stderr": 0.010057051106534367,
49
+ "acc_norm": 0.5749158249158249,
50
+ "acc_norm_stderr": 0.010143966195717844
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.2696245733788396,
54
+ "acc_stderr": 0.012968040686869152,
55
+ "acc_norm": 0.3003412969283277,
56
+ "acc_norm_stderr": 0.013395909309956997
57
+ },
58
+ "sciq": {
59
+ "acc": 0.893,
60
+ "acc_stderr": 0.009779910359847167,
61
+ "acc_norm": 0.88,
62
+ "acc_norm_stderr": 0.010281328012747377
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7225244831338411,
66
+ "acc_stderr": 0.010446818281039952,
67
+ "acc_norm": 0.7393906420021763,
68
+ "acc_norm_stderr": 0.010241826155811633
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed4/evaluation/generation/merged.csv CHANGED
@@ -27,9 +27,27 @@ gem_xsum,5,median,rouge2_fmeasure,0.0002620545073375262
27
  gem_xsum,5,average,multiple,0.025281962304915615
28
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05373073643615162
29
  web_nlg_en,0,median,rouge2_fmeasure,0.05373073643615162
30
- web_nlg_en,0,average,multiple,0.05373073643615162
 
 
 
 
 
 
 
 
 
 
31
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03224922808684278
32
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03224922808684278
33
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0450115763618159
34
  wiki_lingua_en,1,median,rouge2_fmeasure,0.0450115763618159
35
- wiki_lingua_en,1,average,multiple,0.03863040222432934
 
 
 
 
 
 
 
 
 
27
  gem_xsum,5,average,multiple,0.025281962304915615
28
  web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05373073643615162
29
  web_nlg_en,0,median,rouge2_fmeasure,0.05373073643615162
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.053953733277074825
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.053953733277074825
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.055938501058548445
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.055938501058548445
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05687543071165634
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05687543071165634
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05727893718775754
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05727893718775754
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.061290902793478194
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.061290902793478194
40
+ web_nlg_en,5,average,multiple,0.0565113735774445
41
  wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03224922808684278
42
  wiki_lingua_en,0,median,rouge2_fmeasure,0.03224922808684278
43
  wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0450115763618159
44
  wiki_lingua_en,1,median,rouge2_fmeasure,0.0450115763618159
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.046991077091615464
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.046991077091615464
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.041415038125264154
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.041415038125264154
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013029546490366375
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013029546490366375
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0019206264691365757
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0019206264691365757
53
+ wiki_lingua_en,5,average,multiple,0.03010284877084021
2b855b9bc4seed4/evaluation/generation/merged.json CHANGED
@@ -1 +1 @@
1
- {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3977812770960404, "bleu_stderr": 0.042773776393969704, "rouge1_fmeasure": 0.11321310063151303, "rouge1_fmeasure_stderr": 0.0022279121821432222, "rouge1_precision": 0.0769744998872799, "rouge1_precision_stderr": 0.0019295721946577652, "rouge1_recall": 0.3084217053839128, "rouge1_recall_stderr": 0.004775900652467882, "rouge2_fmeasure": 0.05373073643615162, "rouge2_fmeasure_stderr": 0.0014061908002077534, "rouge2_precision": 0.036029038573922645, "rouge2_precision_stderr": 0.0011314658097909978, "rouge2_recall": 0.14884335480613473, "rouge2_recall_stderr": 0.0032559284273627463, "rougeL_fmeasure": 0.10876944861868645, "rougeL_fmeasure_stderr": 0.0020435744759205866, "rougeL_precision": 0.07364934603473708, "rougeL_precision_stderr": 0.0017837348414022293, "rougeL_recall": 0.30006083515709553, "rougeL_recall_stderr": 0.004651155763795223, "rougeLsum_fmeasure": 0.10778741619118525, "rougeLsum_fmeasure_stderr": 0.0020874022925801857, "rougeLsum_precision": 0.07333854015351937, "rougeLsum_precision_stderr": 0.0018346409882236817, "rougeLsum_recall": 0.2940880425231954, "rougeLsum_recall_stderr": 0.00448763966555627}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.3265737143060392, "bleu_stderr": 0.05144732865628208, "rouge1_fmeasure": 0.17044572730648816, "rouge1_fmeasure_stderr": 0.0017736206602746725, "rouge1_precision": 0.1455553771831484, "rouge1_precision_stderr": 0.001822183469048219, "rouge1_recall": 0.24842139858721166, "rouge1_recall_stderr": 0.002581896458749269, "rouge2_fmeasure": 0.03224922808684278, "rouge2_fmeasure_stderr": 0.0007992587057440431, "rouge2_precision": 0.02746904348907652, "rouge2_precision_stderr": 0.000734949838478965, "rouge2_recall": 0.049293404741380026, "rouge2_recall_stderr": 0.0014019141580128261, "rougeL_fmeasure": 0.1361714573185675, "rougeL_fmeasure_stderr": 0.0012895027270174906, "rougeL_precision": 0.1149161071050528, "rougeL_precision_stderr": 0.001304785756648923, "rougeL_recall": 0.20346388626158698, "rougeL_recall_stderr": 0.0021419836616257755, "rougeLsum_fmeasure": 0.1565676013933749, "rougeLsum_fmeasure_stderr": 0.001602387883565948, "rougeLsum_precision": 0.1334840411046874, "rougeLsum_precision_stderr": 0.0016521020390246263, "rougeLsum_recall": 0.2294411836470692, "rougeLsum_recall_stderr": 0.0023999510628294886}}, "1": {"tldr_en": {"bleu": 2.246279205589283, "bleu_stderr": 0.08591459030255512, "rouge1_fmeasure": 0.1935542802699907, "rouge1_fmeasure_stderr": 0.001958003782927919, "rouge1_precision": 0.1751445150209056, "rouge1_precision_stderr": 0.0023551193979363416, "rouge1_recall": 0.2732696965442855, "rouge1_recall_stderr": 0.0027342348888761856, "rouge2_fmeasure": 0.0450115763618159, "rouge2_fmeasure_stderr": 0.0009707150689037455, "rouge2_precision": 0.04234533338241678, "rouge2_precision_stderr": 0.0012533321285062282, "rouge2_recall": 0.06525005846041558, "rouge2_recall_stderr": 0.001538887481538743, "rougeL_fmeasure": 0.14672861716843905, "rougeL_fmeasure_stderr": 0.0013770639859191664, "rougeL_precision": 0.13216089943364137, "rougeL_precision_stderr": 0.001774044309331969, "rougeL_recall": 0.21158218924860497, "rougeL_recall_stderr": 0.0021608610753667597, "rougeLsum_fmeasure": 0.18090547445662525, "rougeLsum_fmeasure_stderr": 0.0018247563264741187, "rougeLsum_precision": 0.16344934405028377, "rougeLsum_precision_stderr": 0.0022011880764864523, "rougeLsum_recall": 0.25625768066865473, "rougeLsum_recall_stderr": 0.0025841726330260587}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.8316383531304311, "bleu_stderr": 0.07290512983250645, "rouge1_fmeasure": 0.12401829493831736, "rouge1_fmeasure_stderr": 0.0013356050583825897, "rouge1_precision": 0.17215029205196647, "rouge1_precision_stderr": 0.003188745876763668, "rouge1_recall": 0.14132270074178124, "rouge1_recall_stderr": 0.0014347807622836861, "rouge2_fmeasure": 0.013839592252368907, "rouge2_fmeasure_stderr": 0.0006744998388324607, "rouge2_precision": 0.02685112433131068, "rouge2_precision_stderr": 0.002069788903045141, "rouge2_recall": 0.01555067230449375, "rouge2_recall_stderr": 0.0007871809267807966, "rougeL_fmeasure": 0.1165625741372221, "rougeL_fmeasure_stderr": 0.0011694039904290299, "rougeL_precision": 0.15898310735132792, "rougeL_precision_stderr": 0.0029356787079883426, "rougeL_recall": 0.13514593742293743, "rougeL_recall_stderr": 0.001334864171571462, "rougeLsum_fmeasure": 0.10325706305255843, "rougeLsum_fmeasure_stderr": 0.0012798538152521957, "rougeLsum_precision": 0.15028738084951368, "rougeLsum_precision_stderr": 0.003111744841937638, "rougeLsum_recall": 0.11499752781885914, "rougeLsum_recall_stderr": 0.001289170268567555}}, "1": {"generate_text_restaurant": {"bleu": 4.78157811483512, "bleu_stderr": 0.06272692163269489, "rouge1_fmeasure": 0.29266072307096785, "rouge1_fmeasure_stderr": 0.0017871825084829352, "rouge1_precision": 0.23312078822753843, "rouge1_precision_stderr": 0.0018885735159246298, "rouge1_recall": 0.4396892491698765, "rouge1_recall_stderr": 0.002717067470795004, "rouge2_fmeasure": 0.11567035973458402, "rouge2_fmeasure_stderr": 0.0012191850944871271, "rouge2_precision": 0.09195911321024938, "rouge2_precision_stderr": 0.0011699833619322583, "rouge2_recall": 0.1775307806897462, "rouge2_recall_stderr": 0.0019253867179723713, "rougeL_fmeasure": 0.24043738918976132, "rougeL_fmeasure_stderr": 0.0013185800390045094, "rougeL_precision": 0.19005122950831685, "rougeL_precision_stderr": 0.0013704004562845444, "rougeL_recall": 0.3654678352459831, "rougeL_recall_stderr": 0.002287478344080729, "rougeLsum_fmeasure": 0.234859781190313, "rougeLsum_fmeasure_stderr": 0.0016660792972987284, "rougeLsum_precision": 0.1871664733954694, "rougeLsum_precision_stderr": 0.0016795998819175034, "rougeLsum_recall": 0.3531518958002849, "rougeLsum_recall_stderr": 0.002549612816092206}}, "2": {"generate_text_restaurant": {"bleu": 5.0478792857544, "bleu_stderr": 0.0955947928915754, "rouge1_fmeasure": 0.29410346427248507, "rouge1_fmeasure_stderr": 0.0016623975256268887, "rouge1_precision": 0.2241050828791601, "rouge1_precision_stderr": 0.0014034728745380678, "rouge1_recall": 0.45236888449873663, "rouge1_recall_stderr": 0.0026715518098690376, "rouge2_fmeasure": 0.12443155899982682, "rouge2_fmeasure_stderr": 0.0011772133733908288, "rouge2_precision": 0.09386197076280443, "rouge2_precision_stderr": 0.0009148926844506597, "rouge2_recall": 0.19659067571129185, "rouge2_recall_stderr": 0.002002402707328385, "rougeL_fmeasure": 0.23934584766173644, "rougeL_fmeasure_stderr": 0.001359615817058388, "rougeL_precision": 0.1818158854357878, "rougeL_precision_stderr": 0.0011128905680916296, "rougeL_recall": 0.37081835945909475, "rougeL_recall_stderr": 0.002375611024115413, "rougeLsum_fmeasure": 0.23736656274040938, "rougeLsum_fmeasure_stderr": 0.0016040431016784457, "rougeLsum_precision": 0.18088762184661167, "rougeLsum_precision_stderr": 0.0013194689839081237, "rougeLsum_recall": 0.3653062516147518, "rougeLsum_recall_stderr": 0.002573870006572897}}, "3": {"generate_text_restaurant": {"bleu": 5.087672258313805, "bleu_stderr": 0.06064658943556918, "rouge1_fmeasure": 0.2935313367354299, "rouge1_fmeasure_stderr": 0.0016440289497582862, "rouge1_precision": 0.22575053010652318, "rouge1_precision_stderr": 0.0014412384135060164, "rouge1_recall": 0.4451731054178249, "rouge1_recall_stderr": 0.0026367476311682553, "rouge2_fmeasure": 0.12750456068786548, "rouge2_fmeasure_stderr": 0.0011665340513659086, "rouge2_precision": 0.09736656166579481, "rouge2_precision_stderr": 0.0009571903413424144, "rouge2_recall": 0.19789987241940404, "rouge2_recall_stderr": 0.001943291572471298, "rougeL_fmeasure": 0.23171342431925485, "rougeL_fmeasure_stderr": 0.0013820651159774868, "rougeL_precision": 0.17757595674840093, "rougeL_precision_stderr": 0.0011597800257508988, "rougeL_recall": 0.35437168445799366, "rougeL_recall_stderr": 0.002406937240308316, "rougeLsum_fmeasure": 0.2369908664432372, "rougeLsum_fmeasure_stderr": 0.0015921537433089966, "rougeLsum_precision": 0.18230511282294262, "rougeLsum_precision_stderr": 0.0013480202004844649, "rougeLsum_recall": 0.35950273744023503, "rougeLsum_recall_stderr": 0.0025427736589814804}}, "4": {"generate_text_restaurant": {"bleu": 4.949082794997751, "bleu_stderr": 0.055429177016200946, "rouge1_fmeasure": 0.2910921816482019, "rouge1_fmeasure_stderr": 0.0015885775216964056, "rouge1_precision": 0.22452930915653382, "rouge1_precision_stderr": 0.0013706284547945777, "rouge1_recall": 0.4378733222042296, "rouge1_recall_stderr": 0.002564401138957007, "rouge2_fmeasure": 0.12794767322315437, "rouge2_fmeasure_stderr": 0.0011202406449983555, "rouge2_precision": 0.09792216465445121, "rouge2_precision_stderr": 0.0009076455701348056, "rouge2_recall": 0.19696623942747024, "rouge2_recall_stderr": 0.0018515269491723755, "rougeL_fmeasure": 0.2233868566066242, "rougeL_fmeasure_stderr": 0.0013796353168918603, "rougeL_precision": 0.1717142971339339, "rougeL_precision_stderr": 0.001133416491648434, "rougeL_recall": 0.3387766905491376, "rougeL_recall_stderr": 0.0023817249246171233, "rougeLsum_fmeasure": 0.23519053811366097, "rougeLsum_fmeasure_stderr": 0.0015567112343627231, "rougeLsum_precision": 0.18151858683664857, "rougeLsum_precision_stderr": 0.0013083375652673337, "rougeLsum_recall": 0.3536807254912674, "rougeLsum_recall_stderr": 0.0024776655790481254}}, "5": {"generate_text_restaurant": {"bleu": 4.92314625019521, "bleu_stderr": 0.07551601164459402, "rouge1_fmeasure": 0.29047511185661623, "rouge1_fmeasure_stderr": 0.0015747872767095687, "rouge1_precision": 0.22479618315617036, "rouge1_precision_stderr": 0.0013868896093755887, "rouge1_recall": 0.43452708471524737, "rouge1_recall_stderr": 0.0024930182279991387, "rouge2_fmeasure": 0.12869439521024578, "rouge2_fmeasure_stderr": 0.0011189877822356184, "rouge2_precision": 0.09897663818887681, "rouge2_precision_stderr": 0.0009251066948960677, "rouge2_recall": 0.19639017574031262, "rouge2_recall_stderr": 0.0018025343448513833, "rougeL_fmeasure": 0.2205382115238159, "rougeL_fmeasure_stderr": 0.0013626250956279787, "rougeL_precision": 0.17004099574115009, "rougeL_precision_stderr": 0.001134684990854792, "rougeL_recall": 0.3326629480123629, "rougeL_recall_stderr": 0.002305812494786618, "rougeLsum_fmeasure": 0.234701143013948, "rougeLsum_fmeasure_stderr": 0.0015535330109712124, "rougeLsum_precision": 0.1818515753539541, "rougeLsum_precision_stderr": 0.0013303745113331451, "rougeLsum_recall": 0.35055241632066986, "rougeLsum_recall_stderr": 0.0023989305439091613}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8132047971863825, "bleu_stderr": 0.10507639003675621, "rouge1_fmeasure": 0.2052611768876611, "rouge1_fmeasure_stderr": 0.0025369090518138323, "rouge1_precision": 0.15271559337599452, "rouge1_precision_stderr": 0.0021790062266520633, "rouge1_recall": 0.34254054968490133, "rouge1_recall_stderr": 0.004260396408544156, "rouge2_fmeasure": 0.04580058504146635, "rouge2_fmeasure_stderr": 0.0015583118832614255, "rouge2_precision": 0.03352761852148264, "rouge2_precision_stderr": 0.0011802779612157131, "rouge2_recall": 0.07896585343651522, "rouge2_recall_stderr": 0.0027517843135959367, "rougeL_fmeasure": 0.15443611414487646, "rougeL_fmeasure_stderr": 0.0019292682405073226, "rougeL_precision": 0.1146736926943474, "rougeL_precision_stderr": 0.0016456187515090986, "rougeL_recall": 0.259355492478393, "rougeL_recall_stderr": 0.0033575304867556224, "rougeLsum_fmeasure": 0.1623209095287298, "rougeLsum_fmeasure_stderr": 0.002153521745163498, "rougeLsum_precision": 0.12033937949221436, "rougeLsum_precision_stderr": 0.001777118280398284, "rougeLsum_recall": 0.27267781274491293, "rougeLsum_recall_stderr": 0.0037426738081733297}}, "1": {"article_DOC_summary": {"bleu": 1.3073756617096712, "bleu_stderr": 0.052782982954130135, "rouge1_fmeasure": 0.17449967820288784, "rouge1_fmeasure_stderr": 0.002369336423066145, "rouge1_precision": 0.12381251721575101, "rouge1_precision_stderr": 0.001760120088594068, "rouge1_recall": 0.30858678186952637, "rouge1_recall_stderr": 0.004149119002014553, "rouge2_fmeasure": 0.03429401547311063, "rouge2_fmeasure_stderr": 0.001321638473797611, "rouge2_precision": 0.02413343941691776, "rouge2_precision_stderr": 0.0009341048823033872, "rouge2_recall": 0.06223421415253265, "rouge2_recall_stderr": 0.002465205032601517, "rougeL_fmeasure": 0.13534497645134072, "rougeL_fmeasure_stderr": 0.001763536683059507, "rougeL_precision": 0.095897554139256, "rougeL_precision_stderr": 0.0013059048049442708, "rougeL_recall": 0.24070643244958356, "rougeL_recall_stderr": 0.003216790067169807, "rougeLsum_fmeasure": 0.1391671752855576, "rougeLsum_fmeasure_stderr": 0.0019150506559230587, "rougeLsum_precision": 0.09857402758907566, "rougeLsum_precision_stderr": 0.001409564152632999, "rougeLsum_recall": 0.24762449519359184, "rougeLsum_recall_stderr": 0.003485361562548633}}, "2": {"article_DOC_summary": {"bleu": 1.274617589181825, "bleu_stderr": 0.06988897960629979, "rouge1_fmeasure": 0.16706303415291662, "rouge1_fmeasure_stderr": 0.0023836505487089613, "rouge1_precision": 0.11868867296005936, "rouge1_precision_stderr": 0.001779378842023062, "rouge1_recall": 0.2939312480067265, "rouge1_recall_stderr": 0.004034498400782806, "rouge2_fmeasure": 0.03196269246537769, "rouge2_fmeasure_stderr": 0.0012778809501463474, "rouge2_precision": 0.02254891407181736, "rouge2_precision_stderr": 0.0009107954579101655, "rouge2_recall": 0.05721825746590915, "rouge2_recall_stderr": 0.0023122884359895865, "rougeL_fmeasure": 0.13284294676915245, "rougeL_fmeasure_stderr": 0.0018011533083606107, "rougeL_precision": 0.09420170009730003, "rougeL_precision_stderr": 0.0013350153387053478, "rougeL_recall": 0.23498244366425464, "rougeL_recall_stderr": 0.0031541621527399347, "rougeLsum_fmeasure": 0.13290293761875338, "rougeLsum_fmeasure_stderr": 0.0019240849725477038, "rougeLsum_precision": 0.09420463075665499, "rougeLsum_precision_stderr": 0.0014200580556147809, "rougeLsum_recall": 0.23525781909094762, "rougeLsum_recall_stderr": 0.003358191759919886}}, "3": {"article_DOC_summary": {"bleu": 1.2652546093060948, "bleu_stderr": 0.1102233610719106, "rouge1_fmeasure": 0.16035644841923463, "rouge1_fmeasure_stderr": 0.002432805663341634, "rouge1_precision": 0.1164306972264782, "rouge1_precision_stderr": 0.0018969898420005478, "rouge1_recall": 0.2773994871952473, "rouge1_recall_stderr": 0.004203625414077099, "rouge2_fmeasure": 0.03073956011709992, "rouge2_fmeasure_stderr": 0.0012903967752292147, "rouge2_precision": 0.02199215206024388, "rouge2_precision_stderr": 0.000927475402673862, "rouge2_recall": 0.05451419978518107, "rouge2_recall_stderr": 0.002364264289540265, "rougeL_fmeasure": 0.1289535547192244, "rougeL_fmeasure_stderr": 0.0019271136239755735, "rougeL_precision": 0.09355033719652096, "rougeL_precision_stderr": 0.0015013832879212095, "rougeL_recall": 0.22386364233324585, "rougeL_recall_stderr": 0.003395665797756175, "rougeLsum_fmeasure": 0.12801546135064906, "rougeLsum_fmeasure_stderr": 0.002005640936353301, "rougeLsum_precision": 0.0926981955017463, "rougeLsum_precision_stderr": 0.0015440205637130833, "rougeLsum_recall": 0.22295778433165891, "rougeLsum_recall_stderr": 0.003570721980607728}}, "4": {"article_DOC_summary": {"bleu": 0.666497067867732, "bleu_stderr": 0.1281490153774834, "rouge1_fmeasure": 0.04558344652586532, "rouge1_fmeasure_stderr": 0.0025823104676261617, "rouge1_precision": 0.038572982619784625, "rouge1_precision_stderr": 0.0024729249944330568, "rouge1_recall": 0.07107678930090279, "rouge1_recall_stderr": 0.004076533013525894, "rouge2_fmeasure": 0.008632866225101583, "rouge2_fmeasure_stderr": 0.0008761371831761245, "rouge2_precision": 0.007545150209955235, "rouge2_precision_stderr": 0.0011189841120498008, "rouge2_recall": 0.013869137016367862, "rouge2_recall_stderr": 0.0014296163987011384, "rougeL_fmeasure": 0.036532845494538814, "rougeL_fmeasure_stderr": 0.0020427276762614533, "rougeL_precision": 0.031142070143318712, "rougeL_precision_stderr": 0.0020375938036984393, "rougeL_recall": 0.05726286831513433, "rougeL_recall_stderr": 0.003284515273864642, "rougeLsum_fmeasure": 0.03702420791798951, "rougeLsum_fmeasure_stderr": 0.002104823034799153, "rougeLsum_precision": 0.03157558833164938, "rougeLsum_precision_stderr": 0.002083883314562155, "rougeLsum_recall": 0.05812328559471168, "rougeLsum_recall_stderr": 0.003395019535792967}}, "5": {"article_DOC_summary": {"bleu": 1.3049274959071692e-36, "bleu_stderr": 4.8336427940633265e-32, "rouge1_fmeasure": 0.002609066738796257, "rouge1_fmeasure_stderr": 0.0007028208233684982, "rouge1_precision": 0.0030509327825175713, "rouge1_precision_stderr": 0.0008372862596545976, "rouge1_recall": 0.0024067151728067427, "rouge1_recall_stderr": 0.0006481584301909523, "rouge2_fmeasure": 0.0002620545073375262, "rouge2_fmeasure_stderr": 0.0001644609325929078, "rouge2_precision": 0.0003414315983683353, "rouge2_precision_stderr": 0.00021852535649681038, "rouge2_recall": 0.00021535068704880026, "rouge2_recall_stderr": 0.0001343944067785059, "rougeL_fmeasure": 0.0019350603010447056, "rougeL_fmeasure_stderr": 0.0005187618126730011, "rougeL_precision": 0.0022772118710841765, "rougeL_precision_stderr": 0.000629263581126937, "rougeL_recall": 0.0017942177079948414, "rougeL_recall_stderr": 0.0004874043934673781, "rougeLsum_fmeasure": 0.002076852746710041, "rougeLsum_fmeasure_stderr": 0.0005579881804536707, "rougeLsum_precision": 0.0024232484386406926, "rougeLsum_precision_stderr": 0.000665024608270745, "rougeLsum_recall": 0.0019331526416023613, "rougeLsum_recall_stderr": 0.0005262077504507022}}}}
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3977812770960404, "bleu_stderr": 0.042773776393969704, "rouge1_fmeasure": 0.11321310063151303, "rouge1_fmeasure_stderr": 0.0022279121821432222, "rouge1_precision": 0.0769744998872799, "rouge1_precision_stderr": 0.0019295721946577652, "rouge1_recall": 0.3084217053839128, "rouge1_recall_stderr": 0.004775900652467882, "rouge2_fmeasure": 0.05373073643615162, "rouge2_fmeasure_stderr": 0.0014061908002077534, "rouge2_precision": 0.036029038573922645, "rouge2_precision_stderr": 0.0011314658097909978, "rouge2_recall": 0.14884335480613473, "rouge2_recall_stderr": 0.0032559284273627463, "rougeL_fmeasure": 0.10876944861868645, "rougeL_fmeasure_stderr": 0.0020435744759205866, "rougeL_precision": 0.07364934603473708, "rougeL_precision_stderr": 0.0017837348414022293, "rougeL_recall": 0.30006083515709553, "rougeL_recall_stderr": 0.004651155763795223, "rougeLsum_fmeasure": 0.10778741619118525, "rougeLsum_fmeasure_stderr": 0.0020874022925801857, "rougeLsum_precision": 0.07333854015351937, "rougeLsum_precision_stderr": 0.0018346409882236817, "rougeLsum_recall": 0.2940880425231954, "rougeLsum_recall_stderr": 0.00448763966555627}}, "1": {"PALM_prompt": {"bleu": 0.4858884207669602, "bleu_stderr": 0.03947424762561928, "rouge1_fmeasure": 0.11762722694279593, "rouge1_fmeasure_stderr": 0.0020593738501179537, "rouge1_precision": 0.07647665252229767, "rouge1_precision_stderr": 0.0016779106301236663, "rouge1_recall": 0.3687074301312142, "rouge1_recall_stderr": 0.005245823782864971, "rouge2_fmeasure": 0.053953733277074825, "rouge2_fmeasure_stderr": 0.0013020349776540338, "rouge2_precision": 0.03542714618000757, "rouge2_precision_stderr": 0.0011555879738579082, "rouge2_recall": 0.17376998543157893, "rouge2_recall_stderr": 0.0035488947049867065, "rougeL_fmeasure": 0.10988057843173323, "rougeL_fmeasure_stderr": 0.0018200534163827285, "rougeL_precision": 0.07128439719009544, "rougeL_precision_stderr": 0.0014868314257995955, "rougeL_recall": 0.34595406334624373, "rougeL_recall_stderr": 0.004831985611026802, "rougeLsum_fmeasure": 0.11185076879917913, "rougeLsum_fmeasure_stderr": 0.0019314277717434314, "rougeLsum_precision": 0.07274032147241147, "rougeLsum_precision_stderr": 0.001570288324310693, "rougeLsum_recall": 0.34905727660039526, "rougeLsum_recall_stderr": 0.004820236454514781}}, "2": {"PALM_prompt": {"bleu": 0.4865401467524791, "bleu_stderr": 0.03301489219343109, "rouge1_fmeasure": 0.12140690139263367, "rouge1_fmeasure_stderr": 0.0020934945566479517, "rouge1_precision": 0.0785323602353207, "rouge1_precision_stderr": 0.0016674769473547485, "rouge1_recall": 0.3826963273434018, "rouge1_recall_stderr": 0.005091498719758241, "rouge2_fmeasure": 0.055938501058548445, "rouge2_fmeasure_stderr": 0.0013133256141269673, "rouge2_precision": 0.036117043681572876, "rouge2_precision_stderr": 0.0010074919212131558, "rouge2_recall": 0.1843419679127612, "rouge2_recall_stderr": 0.0036082743272132984, "rougeL_fmeasure": 0.11283636862911874, "rougeL_fmeasure_stderr": 0.001838687943060424, "rougeL_precision": 0.07278295742000836, "rougeL_precision_stderr": 0.001448738396184804, "rougeL_recall": 0.35684111429066556, "rougeL_recall_stderr": 0.004639509265144684, "rougeLsum_fmeasure": 0.11499262973438232, "rougeLsum_fmeasure_stderr": 0.001943551458826228, "rougeLsum_precision": 0.07434708129903707, "rougeLsum_precision_stderr": 0.0015324213206510558, "rougeLsum_recall": 0.3616139497567665, "rougeLsum_recall_stderr": 0.00472174497997542}}, "3": {"PALM_prompt": {"bleu": 0.4839284785438761, "bleu_stderr": 0.03327890595001754, "rouge1_fmeasure": 0.12242477012284851, "rouge1_fmeasure_stderr": 0.0020131075559474514, "rouge1_precision": 0.0793752929679806, "rouge1_precision_stderr": 0.0016441162972096251, "rouge1_recall": 0.3900658520481954, "rouge1_recall_stderr": 0.005132308563135307, "rouge2_fmeasure": 0.05687543071165634, "rouge2_fmeasure_stderr": 0.0013202007845555955, "rouge2_precision": 0.03690637170749387, "rouge2_precision_stderr": 0.0010764450805284798, "rouge2_recall": 0.18958917794351504, "rouge2_recall_stderr": 0.003657962608393784, "rougeL_fmeasure": 0.11435077399921943, "rougeL_fmeasure_stderr": 0.0017945311612407471, "rougeL_precision": 0.0738949860941832, "rougeL_precision_stderr": 0.0014436209066605562, "rougeL_recall": 0.3648598934397, "rougeL_recall_stderr": 0.00471187223794295, "rougeLsum_fmeasure": 0.11613553078433414, "rougeLsum_fmeasure_stderr": 0.0018941056856690433, "rougeLsum_precision": 0.07524201527148952, "rougeLsum_precision_stderr": 0.0015293036433053158, "rougeLsum_recall": 0.36886308727557704, "rougeLsum_recall_stderr": 0.004760348279415875}}, "4": {"PALM_prompt": {"bleu": 0.5548270479926116, "bleu_stderr": 0.03480896456441664, "rouge1_fmeasure": 0.12416734800976433, "rouge1_fmeasure_stderr": 0.002017036697995381, "rouge1_precision": 0.08089421581764959, "rouge1_precision_stderr": 0.0016865963323483071, "rouge1_recall": 0.38947484896470447, "rouge1_recall_stderr": 0.004981758956403416, "rouge2_fmeasure": 0.05727893718775754, "rouge2_fmeasure_stderr": 0.0012728110625899367, "rouge2_precision": 0.03685145934733897, "rouge2_precision_stderr": 0.0009781322799148689, "rouge2_recall": 0.1920800837935358, "rouge2_recall_stderr": 0.003704050879158052, "rougeL_fmeasure": 0.11498838903650449, "rougeL_fmeasure_stderr": 0.0017679546975187008, "rougeL_precision": 0.074624615413753, "rougeL_precision_stderr": 0.0014406232763323073, "rougeL_recall": 0.3618908595651899, "rougeL_recall_stderr": 0.004544473420839658, "rougeLsum_fmeasure": 0.11684934686383387, "rougeLsum_fmeasure_stderr": 0.0018442583870645086, "rougeLsum_precision": 0.07596400780735503, "rougeLsum_precision_stderr": 0.0015019862620606643, "rougeLsum_recall": 0.3668928371056906, "rougeLsum_recall_stderr": 0.004598554972939322}}, "5": {"PALM_prompt": {"bleu": 0.5798645713024126, "bleu_stderr": 0.043593199687327125, "rouge1_fmeasure": 0.13094745646721262, "rouge1_fmeasure_stderr": 0.0021851947780450946, "rouge1_precision": 0.08676390071302464, "rouge1_precision_stderr": 0.0020216057971965164, "rouge1_recall": 0.402597727832581, "rouge1_recall_stderr": 0.004978758492596992, "rouge2_fmeasure": 0.061290902793478194, "rouge2_fmeasure_stderr": 0.0014102409525092647, "rouge2_precision": 0.04103268346512717, "rouge2_precision_stderr": 0.0014088036590616635, "rouge2_recall": 0.1996309916287194, "rouge2_recall_stderr": 0.003707480656263498, "rougeL_fmeasure": 0.11998668588666234, "rougeL_fmeasure_stderr": 0.0018836137894341057, "rougeL_precision": 0.07926455081332483, "rougeL_precision_stderr": 0.001780734719453499, "rougeL_recall": 0.3718295970280543, "rougeL_recall_stderr": 0.004501635765041348, "rougeLsum_fmeasure": 0.12286536224775627, "rougeLsum_fmeasure_stderr": 0.00198637085681598, "rougeLsum_precision": 0.08136215990275024, "rougeLsum_precision_stderr": 0.0018569641285268036, "rougeLsum_recall": 0.3786252468628803, "rougeLsum_recall_stderr": 0.0045841623740715725}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.3265737143060392, "bleu_stderr": 0.05144732865628208, "rouge1_fmeasure": 0.17044572730648816, "rouge1_fmeasure_stderr": 0.0017736206602746725, "rouge1_precision": 0.1455553771831484, "rouge1_precision_stderr": 0.001822183469048219, "rouge1_recall": 0.24842139858721166, "rouge1_recall_stderr": 0.002581896458749269, "rouge2_fmeasure": 0.03224922808684278, "rouge2_fmeasure_stderr": 0.0007992587057440431, "rouge2_precision": 0.02746904348907652, "rouge2_precision_stderr": 0.000734949838478965, "rouge2_recall": 0.049293404741380026, "rouge2_recall_stderr": 0.0014019141580128261, "rougeL_fmeasure": 0.1361714573185675, "rougeL_fmeasure_stderr": 0.0012895027270174906, "rougeL_precision": 0.1149161071050528, "rougeL_precision_stderr": 0.001304785756648923, "rougeL_recall": 0.20346388626158698, "rougeL_recall_stderr": 0.0021419836616257755, "rougeLsum_fmeasure": 0.1565676013933749, "rougeLsum_fmeasure_stderr": 0.001602387883565948, "rougeLsum_precision": 0.1334840411046874, "rougeLsum_precision_stderr": 0.0016521020390246263, "rougeLsum_recall": 0.2294411836470692, "rougeLsum_recall_stderr": 0.0023999510628294886}}, "1": {"tldr_en": {"bleu": 2.246279205589283, "bleu_stderr": 0.08591459030255512, "rouge1_fmeasure": 0.1935542802699907, "rouge1_fmeasure_stderr": 0.001958003782927919, "rouge1_precision": 0.1751445150209056, "rouge1_precision_stderr": 0.0023551193979363416, "rouge1_recall": 0.2732696965442855, "rouge1_recall_stderr": 0.0027342348888761856, "rouge2_fmeasure": 0.0450115763618159, "rouge2_fmeasure_stderr": 0.0009707150689037455, "rouge2_precision": 0.04234533338241678, "rouge2_precision_stderr": 0.0012533321285062282, "rouge2_recall": 0.06525005846041558, "rouge2_recall_stderr": 0.001538887481538743, "rougeL_fmeasure": 0.14672861716843905, "rougeL_fmeasure_stderr": 0.0013770639859191664, "rougeL_precision": 0.13216089943364137, "rougeL_precision_stderr": 0.001774044309331969, "rougeL_recall": 0.21158218924860497, "rougeL_recall_stderr": 0.0021608610753667597, "rougeLsum_fmeasure": 0.18090547445662525, "rougeLsum_fmeasure_stderr": 0.0018247563264741187, "rougeLsum_precision": 0.16344934405028377, "rougeLsum_precision_stderr": 0.0022011880764864523, "rougeLsum_recall": 0.25625768066865473, "rougeLsum_recall_stderr": 0.0025841726330260587}}, "2": {"tldr_en": {"bleu": 2.345475565488728, "bleu_stderr": 0.0891681552482958, "rouge1_fmeasure": 0.19781195265437124, "rouge1_fmeasure_stderr": 0.0019018498965456766, "rouge1_precision": 0.20476433639887376, "rouge1_precision_stderr": 0.0030287812733787453, "rouge1_recall": 0.2652236074559944, "rouge1_recall_stderr": 0.002665429988358408, "rouge2_fmeasure": 0.046991077091615464, "rouge2_fmeasure_stderr": 0.001020568124375834, "rouge2_precision": 0.053347046649297854, "rouge2_precision_stderr": 0.001742234751075911, "rouge2_recall": 0.06360050869115526, "rouge2_recall_stderr": 0.0014902098542860026, "rougeL_fmeasure": 0.14931959492711866, "rougeL_fmeasure_stderr": 0.0013846083905750103, "rougeL_precision": 0.15578866391543178, "rougeL_precision_stderr": 0.0024507775797221014, "rougeL_recall": 0.20352525777938035, "rougeL_recall_stderr": 0.002128220090594726, "rougeLsum_fmeasure": 0.1859254356744225, "rougeLsum_fmeasure_stderr": 0.0017890566279122754, "rougeLsum_precision": 0.1923890093270234, "rougeLsum_precision_stderr": 0.002878516547196234, "rougeLsum_recall": 0.2501743397926172, "rougeLsum_recall_stderr": 0.002542023706010739}}, "3": {"tldr_en": {"bleu": 2.486551199486084, "bleu_stderr": 0.10733197644139546, "rouge1_fmeasure": 0.16511142540923185, "rouge1_fmeasure_stderr": 0.002230079701419594, "rouge1_precision": 0.19042957427828894, "rouge1_precision_stderr": 0.0034896963183491847, "rouge1_recall": 0.21529366744218953, "rouge1_recall_stderr": 0.003129813215042198, "rouge2_fmeasure": 0.041415038125264154, "rouge2_fmeasure_stderr": 0.0010392117723075948, "rouge2_precision": 0.05235448710715267, "rouge2_precision_stderr": 0.0019272746242035023, "rouge2_recall": 0.05472437719732025, "rouge2_recall_stderr": 0.0014918349514131508, "rougeL_fmeasure": 0.12591124148167096, "rougeL_fmeasure_stderr": 0.001652558982701636, "rougeL_precision": 0.14847432851778467, "rougeL_precision_stderr": 0.002908138351035923, "rougeL_recall": 0.16619517405565673, "rougeL_recall_stderr": 0.0024735526294640425, "rougeLsum_fmeasure": 0.1554595148861975, "rougeLsum_fmeasure_stderr": 0.0021008206373109716, "rougeLsum_precision": 0.1799429391587834, "rougeLsum_precision_stderr": 0.0033466688917255442, "rougeLsum_recall": 0.20294978970613614, "rougeLsum_recall_stderr": 0.0029704284250721296}}, "4": {"tldr_en": {"bleu": 0.3875462683969639, "bleu_stderr": 0.03245177019496807, "rouge1_fmeasure": 0.052525463500647365, "rouge1_fmeasure_stderr": 0.0018744026243002676, "rouge1_precision": 0.06440626532146364, "rouge1_precision_stderr": 0.00268665279066644, "rouge1_recall": 0.06922307016493702, "rouge1_recall_stderr": 0.0025719141943646023, "rouge2_fmeasure": 0.013029546490366375, "rouge2_fmeasure_stderr": 0.0007288446148165251, "rouge2_precision": 0.0178349647962883, "rouge2_precision_stderr": 0.0013452761441912161, "rouge2_recall": 0.017580774128720993, "rouge2_recall_stderr": 0.0010304717841861568, "rougeL_fmeasure": 0.04095377056897623, "rougeL_fmeasure_stderr": 0.0014503199521407482, "rougeL_precision": 0.05164308441211941, "rougeL_precision_stderr": 0.0022481713778620574, "rougeL_recall": 0.054211222609477736, "rougeL_recall_stderr": 0.002027978675808448, "rougeLsum_fmeasure": 0.04905113460379449, "rougeLsum_fmeasure_stderr": 0.001753375793716932, "rougeLsum_precision": 0.06062658484711723, "rougeLsum_precision_stderr": 0.0025652835059215163, "rougeLsum_recall": 0.06475725537355295, "rougeLsum_recall_stderr": 0.0024200280271889506}}, "5": {"tldr_en": {"bleu": 1.3079122393460444e-07, "bleu_stderr": 1.864662528804058e-07, "rouge1_fmeasure": 0.008076407211393497, "rouge1_fmeasure_stderr": 0.0007760566084133647, "rouge1_precision": 0.010047067438597618, "rouge1_precision_stderr": 0.001137359639935089, "rouge1_recall": 0.010816326440483082, "rouge1_recall_stderr": 0.0011039517232321794, "rouge2_fmeasure": 0.0019206264691365757, "rouge2_fmeasure_stderr": 0.000258647865221635, "rouge2_precision": 0.0027477731760646757, "rouge2_precision_stderr": 0.0005096072048009003, "rouge2_recall": 0.002878573046036408, "rouge2_recall_stderr": 0.000495653003335593, "rougeL_fmeasure": 0.006338539028502161, "rougeL_fmeasure_stderr": 0.0006092979027932292, "rougeL_precision": 0.008090184238955125, "rougeL_precision_stderr": 0.0009625945746132565, "rougeL_recall": 0.00864992417766312, "rougeL_recall_stderr": 0.0009068597058964701, "rougeLsum_fmeasure": 0.007638907375907553, "rougeLsum_fmeasure_stderr": 0.0007393171200102091, "rougeLsum_precision": 0.009552142874268494, "rougeLsum_precision_stderr": 0.001093528882202685, "rougeLsum_recall": 0.010161043868842281, "rougeLsum_recall_stderr": 0.0010388252762865194}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.8316383531304311, "bleu_stderr": 0.07290512983250645, "rouge1_fmeasure": 0.12401829493831736, "rouge1_fmeasure_stderr": 0.0013356050583825897, "rouge1_precision": 0.17215029205196647, "rouge1_precision_stderr": 0.003188745876763668, "rouge1_recall": 0.14132270074178124, "rouge1_recall_stderr": 0.0014347807622836861, "rouge2_fmeasure": 0.013839592252368907, "rouge2_fmeasure_stderr": 0.0006744998388324607, "rouge2_precision": 0.02685112433131068, "rouge2_precision_stderr": 0.002069788903045141, "rouge2_recall": 0.01555067230449375, "rouge2_recall_stderr": 0.0007871809267807966, "rougeL_fmeasure": 0.1165625741372221, "rougeL_fmeasure_stderr": 0.0011694039904290299, "rougeL_precision": 0.15898310735132792, "rougeL_precision_stderr": 0.0029356787079883426, "rougeL_recall": 0.13514593742293743, "rougeL_recall_stderr": 0.001334864171571462, "rougeLsum_fmeasure": 0.10325706305255843, "rougeLsum_fmeasure_stderr": 0.0012798538152521957, "rougeLsum_precision": 0.15028738084951368, "rougeLsum_precision_stderr": 0.003111744841937638, "rougeLsum_recall": 0.11499752781885914, "rougeLsum_recall_stderr": 0.001289170268567555}}, "1": {"generate_text_restaurant": {"bleu": 4.78157811483512, "bleu_stderr": 0.06272692163269489, "rouge1_fmeasure": 0.29266072307096785, "rouge1_fmeasure_stderr": 0.0017871825084829352, "rouge1_precision": 0.23312078822753843, "rouge1_precision_stderr": 0.0018885735159246298, "rouge1_recall": 0.4396892491698765, "rouge1_recall_stderr": 0.002717067470795004, "rouge2_fmeasure": 0.11567035973458402, "rouge2_fmeasure_stderr": 0.0012191850944871271, "rouge2_precision": 0.09195911321024938, "rouge2_precision_stderr": 0.0011699833619322583, "rouge2_recall": 0.1775307806897462, "rouge2_recall_stderr": 0.0019253867179723713, "rougeL_fmeasure": 0.24043738918976132, "rougeL_fmeasure_stderr": 0.0013185800390045094, "rougeL_precision": 0.19005122950831685, "rougeL_precision_stderr": 0.0013704004562845444, "rougeL_recall": 0.3654678352459831, "rougeL_recall_stderr": 0.002287478344080729, "rougeLsum_fmeasure": 0.234859781190313, "rougeLsum_fmeasure_stderr": 0.0016660792972987284, "rougeLsum_precision": 0.1871664733954694, "rougeLsum_precision_stderr": 0.0016795998819175034, "rougeLsum_recall": 0.3531518958002849, "rougeLsum_recall_stderr": 0.002549612816092206}}, "2": {"generate_text_restaurant": {"bleu": 5.0478792857544, "bleu_stderr": 0.0955947928915754, "rouge1_fmeasure": 0.29410346427248507, "rouge1_fmeasure_stderr": 0.0016623975256268887, "rouge1_precision": 0.2241050828791601, "rouge1_precision_stderr": 0.0014034728745380678, "rouge1_recall": 0.45236888449873663, "rouge1_recall_stderr": 0.0026715518098690376, "rouge2_fmeasure": 0.12443155899982682, "rouge2_fmeasure_stderr": 0.0011772133733908288, "rouge2_precision": 0.09386197076280443, "rouge2_precision_stderr": 0.0009148926844506597, "rouge2_recall": 0.19659067571129185, "rouge2_recall_stderr": 0.002002402707328385, "rougeL_fmeasure": 0.23934584766173644, "rougeL_fmeasure_stderr": 0.001359615817058388, "rougeL_precision": 0.1818158854357878, "rougeL_precision_stderr": 0.0011128905680916296, "rougeL_recall": 0.37081835945909475, "rougeL_recall_stderr": 0.002375611024115413, "rougeLsum_fmeasure": 0.23736656274040938, "rougeLsum_fmeasure_stderr": 0.0016040431016784457, "rougeLsum_precision": 0.18088762184661167, "rougeLsum_precision_stderr": 0.0013194689839081237, "rougeLsum_recall": 0.3653062516147518, "rougeLsum_recall_stderr": 0.002573870006572897}}, "3": {"generate_text_restaurant": {"bleu": 5.087672258313805, "bleu_stderr": 0.06064658943556918, "rouge1_fmeasure": 0.2935313367354299, "rouge1_fmeasure_stderr": 0.0016440289497582862, "rouge1_precision": 0.22575053010652318, "rouge1_precision_stderr": 0.0014412384135060164, "rouge1_recall": 0.4451731054178249, "rouge1_recall_stderr": 0.0026367476311682553, "rouge2_fmeasure": 0.12750456068786548, "rouge2_fmeasure_stderr": 0.0011665340513659086, "rouge2_precision": 0.09736656166579481, "rouge2_precision_stderr": 0.0009571903413424144, "rouge2_recall": 0.19789987241940404, "rouge2_recall_stderr": 0.001943291572471298, "rougeL_fmeasure": 0.23171342431925485, "rougeL_fmeasure_stderr": 0.0013820651159774868, "rougeL_precision": 0.17757595674840093, "rougeL_precision_stderr": 0.0011597800257508988, "rougeL_recall": 0.35437168445799366, "rougeL_recall_stderr": 0.002406937240308316, "rougeLsum_fmeasure": 0.2369908664432372, "rougeLsum_fmeasure_stderr": 0.0015921537433089966, "rougeLsum_precision": 0.18230511282294262, "rougeLsum_precision_stderr": 0.0013480202004844649, "rougeLsum_recall": 0.35950273744023503, "rougeLsum_recall_stderr": 0.0025427736589814804}}, "4": {"generate_text_restaurant": {"bleu": 4.949082794997751, "bleu_stderr": 0.055429177016200946, "rouge1_fmeasure": 0.2910921816482019, "rouge1_fmeasure_stderr": 0.0015885775216964056, "rouge1_precision": 0.22452930915653382, "rouge1_precision_stderr": 0.0013706284547945777, "rouge1_recall": 0.4378733222042296, "rouge1_recall_stderr": 0.002564401138957007, "rouge2_fmeasure": 0.12794767322315437, "rouge2_fmeasure_stderr": 0.0011202406449983555, "rouge2_precision": 0.09792216465445121, "rouge2_precision_stderr": 0.0009076455701348056, "rouge2_recall": 0.19696623942747024, "rouge2_recall_stderr": 0.0018515269491723755, "rougeL_fmeasure": 0.2233868566066242, "rougeL_fmeasure_stderr": 0.0013796353168918603, "rougeL_precision": 0.1717142971339339, "rougeL_precision_stderr": 0.001133416491648434, "rougeL_recall": 0.3387766905491376, "rougeL_recall_stderr": 0.0023817249246171233, "rougeLsum_fmeasure": 0.23519053811366097, "rougeLsum_fmeasure_stderr": 0.0015567112343627231, "rougeLsum_precision": 0.18151858683664857, "rougeLsum_precision_stderr": 0.0013083375652673337, "rougeLsum_recall": 0.3536807254912674, "rougeLsum_recall_stderr": 0.0024776655790481254}}, "5": {"generate_text_restaurant": {"bleu": 4.92314625019521, "bleu_stderr": 0.07551601164459402, "rouge1_fmeasure": 0.29047511185661623, "rouge1_fmeasure_stderr": 0.0015747872767095687, "rouge1_precision": 0.22479618315617036, "rouge1_precision_stderr": 0.0013868896093755887, "rouge1_recall": 0.43452708471524737, "rouge1_recall_stderr": 0.0024930182279991387, "rouge2_fmeasure": 0.12869439521024578, "rouge2_fmeasure_stderr": 0.0011189877822356184, "rouge2_precision": 0.09897663818887681, "rouge2_precision_stderr": 0.0009251066948960677, "rouge2_recall": 0.19639017574031262, "rouge2_recall_stderr": 0.0018025343448513833, "rougeL_fmeasure": 0.2205382115238159, "rougeL_fmeasure_stderr": 0.0013626250956279787, "rougeL_precision": 0.17004099574115009, "rougeL_precision_stderr": 0.001134684990854792, "rougeL_recall": 0.3326629480123629, "rougeL_recall_stderr": 0.002305812494786618, "rougeLsum_fmeasure": 0.234701143013948, "rougeLsum_fmeasure_stderr": 0.0015535330109712124, "rougeLsum_precision": 0.1818515753539541, "rougeLsum_precision_stderr": 0.0013303745113331451, "rougeLsum_recall": 0.35055241632066986, "rougeLsum_recall_stderr": 0.0023989305439091613}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8132047971863825, "bleu_stderr": 0.10507639003675621, "rouge1_fmeasure": 0.2052611768876611, "rouge1_fmeasure_stderr": 0.0025369090518138323, "rouge1_precision": 0.15271559337599452, "rouge1_precision_stderr": 0.0021790062266520633, "rouge1_recall": 0.34254054968490133, "rouge1_recall_stderr": 0.004260396408544156, "rouge2_fmeasure": 0.04580058504146635, "rouge2_fmeasure_stderr": 0.0015583118832614255, "rouge2_precision": 0.03352761852148264, "rouge2_precision_stderr": 0.0011802779612157131, "rouge2_recall": 0.07896585343651522, "rouge2_recall_stderr": 0.0027517843135959367, "rougeL_fmeasure": 0.15443611414487646, "rougeL_fmeasure_stderr": 0.0019292682405073226, "rougeL_precision": 0.1146736926943474, "rougeL_precision_stderr": 0.0016456187515090986, "rougeL_recall": 0.259355492478393, "rougeL_recall_stderr": 0.0033575304867556224, "rougeLsum_fmeasure": 0.1623209095287298, "rougeLsum_fmeasure_stderr": 0.002153521745163498, "rougeLsum_precision": 0.12033937949221436, "rougeLsum_precision_stderr": 0.001777118280398284, "rougeLsum_recall": 0.27267781274491293, "rougeLsum_recall_stderr": 0.0037426738081733297}}, "1": {"article_DOC_summary": {"bleu": 1.3073756617096712, "bleu_stderr": 0.052782982954130135, "rouge1_fmeasure": 0.17449967820288784, "rouge1_fmeasure_stderr": 0.002369336423066145, "rouge1_precision": 0.12381251721575101, "rouge1_precision_stderr": 0.001760120088594068, "rouge1_recall": 0.30858678186952637, "rouge1_recall_stderr": 0.004149119002014553, "rouge2_fmeasure": 0.03429401547311063, "rouge2_fmeasure_stderr": 0.001321638473797611, "rouge2_precision": 0.02413343941691776, "rouge2_precision_stderr": 0.0009341048823033872, "rouge2_recall": 0.06223421415253265, "rouge2_recall_stderr": 0.002465205032601517, "rougeL_fmeasure": 0.13534497645134072, "rougeL_fmeasure_stderr": 0.001763536683059507, "rougeL_precision": 0.095897554139256, "rougeL_precision_stderr": 0.0013059048049442708, "rougeL_recall": 0.24070643244958356, "rougeL_recall_stderr": 0.003216790067169807, "rougeLsum_fmeasure": 0.1391671752855576, "rougeLsum_fmeasure_stderr": 0.0019150506559230587, "rougeLsum_precision": 0.09857402758907566, "rougeLsum_precision_stderr": 0.001409564152632999, "rougeLsum_recall": 0.24762449519359184, "rougeLsum_recall_stderr": 0.003485361562548633}}, "2": {"article_DOC_summary": {"bleu": 1.274617589181825, "bleu_stderr": 0.06988897960629979, "rouge1_fmeasure": 0.16706303415291662, "rouge1_fmeasure_stderr": 0.0023836505487089613, "rouge1_precision": 0.11868867296005936, "rouge1_precision_stderr": 0.001779378842023062, "rouge1_recall": 0.2939312480067265, "rouge1_recall_stderr": 0.004034498400782806, "rouge2_fmeasure": 0.03196269246537769, "rouge2_fmeasure_stderr": 0.0012778809501463474, "rouge2_precision": 0.02254891407181736, "rouge2_precision_stderr": 0.0009107954579101655, "rouge2_recall": 0.05721825746590915, "rouge2_recall_stderr": 0.0023122884359895865, "rougeL_fmeasure": 0.13284294676915245, "rougeL_fmeasure_stderr": 0.0018011533083606107, "rougeL_precision": 0.09420170009730003, "rougeL_precision_stderr": 0.0013350153387053478, "rougeL_recall": 0.23498244366425464, "rougeL_recall_stderr": 0.0031541621527399347, "rougeLsum_fmeasure": 0.13290293761875338, "rougeLsum_fmeasure_stderr": 0.0019240849725477038, "rougeLsum_precision": 0.09420463075665499, "rougeLsum_precision_stderr": 0.0014200580556147809, "rougeLsum_recall": 0.23525781909094762, "rougeLsum_recall_stderr": 0.003358191759919886}}, "3": {"article_DOC_summary": {"bleu": 1.2652546093060948, "bleu_stderr": 0.1102233610719106, "rouge1_fmeasure": 0.16035644841923463, "rouge1_fmeasure_stderr": 0.002432805663341634, "rouge1_precision": 0.1164306972264782, "rouge1_precision_stderr": 0.0018969898420005478, "rouge1_recall": 0.2773994871952473, "rouge1_recall_stderr": 0.004203625414077099, "rouge2_fmeasure": 0.03073956011709992, "rouge2_fmeasure_stderr": 0.0012903967752292147, "rouge2_precision": 0.02199215206024388, "rouge2_precision_stderr": 0.000927475402673862, "rouge2_recall": 0.05451419978518107, "rouge2_recall_stderr": 0.002364264289540265, "rougeL_fmeasure": 0.1289535547192244, "rougeL_fmeasure_stderr": 0.0019271136239755735, "rougeL_precision": 0.09355033719652096, "rougeL_precision_stderr": 0.0015013832879212095, "rougeL_recall": 0.22386364233324585, "rougeL_recall_stderr": 0.003395665797756175, "rougeLsum_fmeasure": 0.12801546135064906, "rougeLsum_fmeasure_stderr": 0.002005640936353301, "rougeLsum_precision": 0.0926981955017463, "rougeLsum_precision_stderr": 0.0015440205637130833, "rougeLsum_recall": 0.22295778433165891, "rougeLsum_recall_stderr": 0.003570721980607728}}, "4": {"article_DOC_summary": {"bleu": 0.666497067867732, "bleu_stderr": 0.1281490153774834, "rouge1_fmeasure": 0.04558344652586532, "rouge1_fmeasure_stderr": 0.0025823104676261617, "rouge1_precision": 0.038572982619784625, "rouge1_precision_stderr": 0.0024729249944330568, "rouge1_recall": 0.07107678930090279, "rouge1_recall_stderr": 0.004076533013525894, "rouge2_fmeasure": 0.008632866225101583, "rouge2_fmeasure_stderr": 0.0008761371831761245, "rouge2_precision": 0.007545150209955235, "rouge2_precision_stderr": 0.0011189841120498008, "rouge2_recall": 0.013869137016367862, "rouge2_recall_stderr": 0.0014296163987011384, "rougeL_fmeasure": 0.036532845494538814, "rougeL_fmeasure_stderr": 0.0020427276762614533, "rougeL_precision": 0.031142070143318712, "rougeL_precision_stderr": 0.0020375938036984393, "rougeL_recall": 0.05726286831513433, "rougeL_recall_stderr": 0.003284515273864642, "rougeLsum_fmeasure": 0.03702420791798951, "rougeLsum_fmeasure_stderr": 0.002104823034799153, "rougeLsum_precision": 0.03157558833164938, "rougeLsum_precision_stderr": 0.002083883314562155, "rougeLsum_recall": 0.05812328559471168, "rougeLsum_recall_stderr": 0.003395019535792967}}, "5": {"article_DOC_summary": {"bleu": 1.3049274959071692e-36, "bleu_stderr": 4.8336427940633265e-32, "rouge1_fmeasure": 0.002609066738796257, "rouge1_fmeasure_stderr": 0.0007028208233684982, "rouge1_precision": 0.0030509327825175713, "rouge1_precision_stderr": 0.0008372862596545976, "rouge1_recall": 0.0024067151728067427, "rouge1_recall_stderr": 0.0006481584301909523, "rouge2_fmeasure": 0.0002620545073375262, "rouge2_fmeasure_stderr": 0.0001644609325929078, "rouge2_precision": 0.0003414315983683353, "rouge2_precision_stderr": 0.00021852535649681038, "rouge2_recall": 0.00021535068704880026, "rouge2_recall_stderr": 0.0001343944067785059, "rougeL_fmeasure": 0.0019350603010447056, "rougeL_fmeasure_stderr": 0.0005187618126730011, "rougeL_precision": 0.0022772118710841765, "rougeL_precision_stderr": 0.000629263581126937, "rougeL_recall": 0.0017942177079948414, "rougeL_recall_stderr": 0.0004874043934673781, "rougeLsum_fmeasure": 0.002076852746710041, "rougeLsum_fmeasure_stderr": 0.0005579881804536707, "rougeLsum_precision": 0.0024232484386406926, "rougeLsum_precision_stderr": 0.000665024608270745, "rougeLsum_recall": 0.0019331526416023613, "rougeLsum_recall_stderr": 0.0005262077504507022}}}}
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_0.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.335,0.014933117490932575,0
3
  anli_r2,acc,0.337,0.014955087918653607,0
4
  anli_r3,acc,0.3408333333333333,0.013688600793296939,0
 
 
 
 
 
5
  cb,acc,0.39285714285714285,0.0658538889806635,1
6
  cb,f1,0.1981981981981982,,1
7
  copa,acc,0.71,0.045604802157206845,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.335,0.014933117490932575,0
3
  anli_r2,acc,0.337,0.014955087918653607,0
4
  anli_r3,acc,0.3408333333333333,0.013688600793296939,0
5
+ arc_challenge,acc,0.23720136518771331,0.012430399829260847,0
6
+ arc_challenge,acc_norm,0.2781569965870307,0.013094469919538805,0
7
+ arc_easy,acc,0.5698653198653199,0.010159130445178499,0
8
+ arc_easy,acc_norm,0.48947811447811446,0.010257511546488232,0
9
+ boolq,acc,0.6085626911314985,0.00853643052440395,1
10
  cb,acc,0.39285714285714285,0.0658538889806635,1
11
  cb,f1,0.1981981981981982,,1
12
  copa,acc,0.71,0.045604802157206845,0
13
+ hellaswag,acc,0.43238398725353516,0.004943945069611459,0
14
+ hellaswag,acc_norm,0.5589523999203346,0.004954977202585478,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852209,0
16
+ piqa,acc_norm,0.7328618063112078,0.010323440492612416,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.818,0.012207580637662155,0
19
+ sciq,acc_norm,0.727,0.014095022868717588,0
20
+ storycloze_2016,acc,0.6884019241047569,0.010710200919679799,0
21
+ winogrande,acc,0.5714285714285714,0.013908353814606696,0
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_0_lm-eval_global_step52452_2023-02-25-10-40-14_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.335,
5
- "acc_stderr": 0.014933117490932575
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653607
10
- },
11
- "anli_r3": {
12
- "acc": 0.3408333333333333,
13
- "acc_stderr": 0.013688600793296939
14
- },
15
- "cb": {
16
- "acc": 0.39285714285714285,
17
- "acc_stderr": 0.0658538889806635,
18
- "f1": 0.1981981981981982
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.43238398725353516,
26
- "acc_stderr": 0.004943945069611459,
27
- "acc_norm": 0.5589523999203346,
28
- "acc_norm_stderr": 0.004954977202585478
29
- },
30
- "rte": {
31
- "acc": 0.5415162454873647,
32
- "acc_stderr": 0.029992535385373314
33
- },
34
- "winogrande": {
35
- "acc": 0.5714285714285714,
36
- "acc_stderr": 0.013908353814606696
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6884019241047569,
40
- "acc_stderr": 0.010710200919679799
41
- },
42
- "boolq": {
43
- "acc": 0.6085626911314985,
44
- "acc_stderr": 0.00853643052440395
45
- },
46
- "arc_easy": {
47
- "acc": 0.5698653198653199,
48
- "acc_stderr": 0.010159130445178499,
49
- "acc_norm": 0.48947811447811446,
50
- "acc_norm_stderr": 0.010257511546488232
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23720136518771331,
54
- "acc_stderr": 0.012430399829260847,
55
- "acc_norm": 0.2781569965870307,
56
- "acc_norm_stderr": 0.013094469919538805
57
- },
58
- "sciq": {
59
- "acc": 0.818,
60
- "acc_stderr": 0.012207580637662155,
61
- "acc_norm": 0.727,
62
- "acc_norm_stderr": 0.014095022868717588
63
- },
64
- "piqa": {
65
- "acc": 0.7301414581066377,
66
- "acc_stderr": 0.010356595421852209,
67
- "acc_norm": 0.7328618063112078,
68
- "acc_norm_stderr": 0.010323440492612416
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_1.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.352,0.01511040450564867,0
3
  anli_r2,acc,0.331,0.014888272588203934,0
4
  anli_r3,acc,0.335,0.013630871843821472,0
 
 
 
 
 
5
  cb,acc,0.42857142857142855,0.06672848092813058,1
6
  cb,f1,0.23895094706168044,,1
7
  copa,acc,0.73,0.044619604333847394,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.352,0.01511040450564867,0
3
  anli_r2,acc,0.331,0.014888272588203934,0
4
  anli_r3,acc,0.335,0.013630871843821472,0
5
+ arc_challenge,acc,0.257679180887372,0.012780770562768402,0
6
+ arc_challenge,acc_norm,0.28242320819112626,0.013155456884097224,0
7
+ arc_easy,acc,0.5694444444444444,0.010160345396860074,0
8
+ arc_easy,acc_norm,0.5269360269360269,0.010244884740620092,0
9
+ boolq,acc,0.6055045871559633,0.008548152025770937,1
10
  cb,acc,0.42857142857142855,0.06672848092813058,1
11
  cb,f1,0.23895094706168044,,1
12
  copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.4340768771161123,0.0049462215121452765,0
14
+ hellaswag,acc_norm,0.5611431985660227,0.0049523323781203235,0
15
+ piqa,acc,0.7317736670293797,0.010336761992404483,0
16
+ piqa,acc_norm,0.7247007616974973,0.01042142927736953,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.862,0.0109121526325044,0
19
+ sciq,acc_norm,0.82,0.01215515313551197,0
20
+ storycloze_2016,acc,0.6851950828433993,0.010740068943171383,0
21
+ winogrande,acc,0.5588003157063931,0.013954975072834734,0
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_1.json CHANGED
@@ -42,6 +42,30 @@
42
  "boolq": {
43
  "acc": 0.6055045871559633,
44
  "acc_stderr": 0.008548152025770937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  }
46
  },
47
  "versions": {
@@ -54,6 +78,10 @@
54
  "rte": 0,
55
  "winogrande": 0,
56
  "storycloze_2016": 0,
57
- "boolq": 1
 
 
 
 
58
  }
59
  }
 
42
  "boolq": {
43
  "acc": 0.6055045871559633,
44
  "acc_stderr": 0.008548152025770937
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5694444444444444,
48
+ "acc_stderr": 0.010160345396860074,
49
+ "acc_norm": 0.5269360269360269,
50
+ "acc_norm_stderr": 0.010244884740620092
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.257679180887372,
54
+ "acc_stderr": 0.012780770562768402,
55
+ "acc_norm": 0.28242320819112626,
56
+ "acc_norm_stderr": 0.013155456884097224
57
+ },
58
+ "sciq": {
59
+ "acc": 0.862,
60
+ "acc_stderr": 0.0109121526325044,
61
+ "acc_norm": 0.82,
62
+ "acc_norm_stderr": 0.01215515313551197
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7317736670293797,
66
+ "acc_stderr": 0.010336761992404483,
67
+ "acc_norm": 0.7247007616974973,
68
+ "acc_norm_stderr": 0.01042142927736953
69
  }
70
  },
71
  "versions": {
 
78
  "rte": 0,
79
  "winogrande": 0,
80
  "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_1_lm-eval_global_step52452_2023-02-25-10-40-14_1shots_backup.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.352,
5
- "acc_stderr": 0.01511040450564867
6
- },
7
- "anli_r2": {
8
- "acc": 0.331,
9
- "acc_stderr": 0.014888272588203934
10
- },
11
- "anli_r3": {
12
- "acc": 0.335,
13
- "acc_stderr": 0.013630871843821472
14
- },
15
- "cb": {
16
- "acc": 0.42857142857142855,
17
- "acc_stderr": 0.06672848092813058,
18
- "f1": 0.23895094706168044
19
- },
20
- "copa": {
21
- "acc": 0.73,
22
- "acc_stderr": 0.044619604333847394
23
- },
24
- "hellaswag": {
25
- "acc": 0.4340768771161123,
26
- "acc_stderr": 0.0049462215121452765,
27
- "acc_norm": 0.5611431985660227,
28
- "acc_norm_stderr": 0.0049523323781203235
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5588003157063931,
36
- "acc_stderr": 0.013954975072834734
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6851950828433993,
40
- "acc_stderr": 0.010740068943171383
41
- },
42
- "boolq": {
43
- "acc": 0.6055045871559633,
44
- "acc_stderr": 0.008548152025770937
45
- }
46
- },
47
- "versions": {
48
- "anli_r1": 0,
49
- "anli_r2": 0,
50
- "anli_r3": 0,
51
- "cb": 1,
52
- "copa": 0,
53
- "hellaswag": 0,
54
- "rte": 0,
55
- "winogrande": 0,
56
- "storycloze_2016": 0,
57
- "boolq": 1
58
- }
59
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_2.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.342,0.015008706182121731,0
3
  anli_r2,acc,0.327,0.014842213153411245,0
4
  anli_r3,acc,0.3283333333333333,0.013562032919529019,0
 
 
 
 
 
5
  cb,acc,0.375,0.06527912098338669,1
6
  cb,f1,0.21622522072297187,,1
7
  copa,acc,0.71,0.045604802157206845,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.342,0.015008706182121731,0
3
  anli_r2,acc,0.327,0.014842213153411245,0
4
  anli_r3,acc,0.3283333333333333,0.013562032919529019,0
5
+ arc_challenge,acc,0.26621160409556316,0.012915774781523209,0
6
+ arc_challenge,acc_norm,0.28668941979522183,0.013214986329274777,0
7
+ arc_easy,acc,0.5858585858585859,0.01010738767300251,0
8
+ arc_easy,acc_norm,0.5462962962962963,0.010215708295494117,0
9
+ boolq,acc,0.6070336391437309,0.00854233514797057,1
10
  cb,acc,0.375,0.06527912098338669,1
11
  cb,f1,0.21622522072297187,,1
12
  copa,acc,0.71,0.045604802157206845,0
13
+ hellaswag,acc,0.4314877514439355,0.004942716091996078,0
14
+ hellaswag,acc_norm,0.5613423620792671,0.004952087083128898,0
15
+ piqa,acc,0.7301414581066377,0.0103565954218522,0
16
+ piqa,acc_norm,0.7317736670293797,0.010336761992404485,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.873,0.010534798620855745,0
19
+ sciq,acc_norm,0.852,0.011234866364235254,0
20
+ storycloze_2016,acc,0.6835916622127205,0.010754780097940887,0
21
+ winogrande,acc,0.5367008681925809,0.01401457845884326,0
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_2.json CHANGED
@@ -34,6 +34,38 @@
34
  "winogrande": {
35
  "acc": 0.5367008681925809,
36
  "acc_stderr": 0.01401457845884326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  }
38
  },
39
  "versions": {
@@ -44,6 +76,12 @@
44
  "copa": 0,
45
  "hellaswag": 0,
46
  "rte": 0,
47
- "winogrande": 0
 
 
 
 
 
 
48
  }
49
  }
 
34
  "winogrande": {
35
  "acc": 0.5367008681925809,
36
  "acc_stderr": 0.01401457845884326
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6835916622127205,
40
+ "acc_stderr": 0.010754780097940887
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6070336391437309,
44
+ "acc_stderr": 0.00854233514797057
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5858585858585859,
48
+ "acc_stderr": 0.01010738767300251,
49
+ "acc_norm": 0.5462962962962963,
50
+ "acc_norm_stderr": 0.010215708295494117
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.26621160409556316,
54
+ "acc_stderr": 0.012915774781523209,
55
+ "acc_norm": 0.28668941979522183,
56
+ "acc_norm_stderr": 0.013214986329274777
57
+ },
58
+ "sciq": {
59
+ "acc": 0.873,
60
+ "acc_stderr": 0.010534798620855745,
61
+ "acc_norm": 0.852,
62
+ "acc_norm_stderr": 0.011234866364235254
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7301414581066377,
66
+ "acc_stderr": 0.0103565954218522,
67
+ "acc_norm": 0.7317736670293797,
68
+ "acc_norm_stderr": 0.010336761992404485
69
  }
70
  },
71
  "versions": {
 
76
  "copa": 0,
77
  "hellaswag": 0,
78
  "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_2_lm-eval_global_step52452_2023-02-25-10-40-14_2shots_backup.json DELETED
@@ -1,49 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.342,
5
- "acc_stderr": 0.015008706182121731
6
- },
7
- "anli_r2": {
8
- "acc": 0.327,
9
- "acc_stderr": 0.014842213153411245
10
- },
11
- "anli_r3": {
12
- "acc": 0.3283333333333333,
13
- "acc_stderr": 0.013562032919529019
14
- },
15
- "cb": {
16
- "acc": 0.375,
17
- "acc_stderr": 0.06527912098338669,
18
- "f1": 0.21622522072297187
19
- },
20
- "copa": {
21
- "acc": 0.71,
22
- "acc_stderr": 0.045604802157206845
23
- },
24
- "hellaswag": {
25
- "acc": 0.4314877514439355,
26
- "acc_stderr": 0.004942716091996078,
27
- "acc_norm": 0.5613423620792671,
28
- "acc_norm_stderr": 0.004952087083128898
29
- },
30
- "rte": {
31
- "acc": 0.5270758122743683,
32
- "acc_stderr": 0.030052303463143706
33
- },
34
- "winogrande": {
35
- "acc": 0.5367008681925809,
36
- "acc_stderr": 0.01401457845884326
37
- }
38
- },
39
- "versions": {
40
- "anli_r1": 0,
41
- "anli_r2": 0,
42
- "anli_r3": 0,
43
- "cb": 1,
44
- "copa": 0,
45
- "hellaswag": 0,
46
- "rte": 0,
47
- "winogrande": 0
48
- }
49
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_3.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.341,0.014998131348402709,0
3
  anli_r2,acc,0.342,0.01500870618212173,0
4
  anli_r3,acc,0.315,0.013415009084004862,0
 
 
 
 
 
5
  cb,acc,0.375,0.06527912098338669,1
6
  cb,f1,0.2852689793866265,,1
7
  copa,acc,0.76,0.042923469599092816,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.341,0.014998131348402709,0
3
  anli_r2,acc,0.342,0.01500870618212173,0
4
  anli_r3,acc,0.315,0.013415009084004862,0
5
+ arc_challenge,acc,0.28071672354948807,0.013131238126975574,0
6
+ arc_challenge,acc_norm,0.2909556313993174,0.013273077865907602,0
7
+ arc_easy,acc,0.5892255892255892,0.010095101349348648,0
8
+ arc_easy,acc_norm,0.555976430976431,0.010195285580783957,0
9
+ boolq,acc,0.6033639143730887,0.008556148582032002,1
10
  cb,acc,0.375,0.06527912098338669,1
11
  cb,f1,0.2852689793866265,,1
12
  copa,acc,0.76,0.042923469599092816,0
13
+ hellaswag,acc,0.43178649671380204,0.004943127583290516,0
14
+ hellaswag,acc_norm,0.5626369249153556,0.004950472918523315,0
15
+ piqa,acc,0.7421109902067464,0.010206956662056257,0
16
+ piqa,acc_norm,0.733949945593036,0.010310039263352826,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.872,0.010570133761108665,0
19
+ sciq,acc_norm,0.856,0.01110798754893915,0
20
+ storycloze_2016,acc,0.6937466595403528,0.010659088460112756,0
21
+ winogrande,acc,0.5564325177584846,0.013962694907620402,0
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_3.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.042923469599092816
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.76,
22
  "acc_stderr": 0.042923469599092816
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.43178649671380204,
26
+ "acc_stderr": 0.004943127583290516,
27
+ "acc_norm": 0.5626369249153556,
28
+ "acc_norm_stderr": 0.004950472918523315
29
+ },
30
+ "rte": {
31
+ "acc": 0.5306859205776173,
32
+ "acc_stderr": 0.03003973059219781
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5564325177584846,
36
+ "acc_stderr": 0.013962694907620402
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6937466595403528,
40
+ "acc_stderr": 0.010659088460112756
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6033639143730887,
44
+ "acc_stderr": 0.008556148582032002
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5892255892255892,
48
+ "acc_stderr": 0.010095101349348648,
49
+ "acc_norm": 0.555976430976431,
50
+ "acc_norm_stderr": 0.010195285580783957
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.28071672354948807,
54
+ "acc_stderr": 0.013131238126975574,
55
+ "acc_norm": 0.2909556313993174,
56
+ "acc_norm_stderr": 0.013273077865907602
57
+ },
58
+ "sciq": {
59
+ "acc": 0.872,
60
+ "acc_stderr": 0.010570133761108665,
61
+ "acc_norm": 0.856,
62
+ "acc_norm_stderr": 0.01110798754893915
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7421109902067464,
66
+ "acc_stderr": 0.010206956662056257,
67
+ "acc_norm": 0.733949945593036,
68
+ "acc_norm_stderr": 0.010310039263352826
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_4.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.34,0.014987482264363935,0
3
  anli_r2,acc,0.335,0.014933117490932573,0
4
  anli_r3,acc,0.3466666666666667,0.013744022550571946,0
 
 
 
 
 
5
  cb,acc,0.42857142857142855,0.06672848092813058,1
6
  cb,f1,0.28729827005689074,,1
7
  copa,acc,0.73,0.044619604333847394,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.34,0.014987482264363935,0
3
  anli_r2,acc,0.335,0.014933117490932573,0
4
  anli_r3,acc,0.3466666666666667,0.013744022550571946,0
5
+ arc_challenge,acc,0.27559726962457337,0.01305716965576184,0
6
+ arc_challenge,acc_norm,0.29948805460750855,0.013385021637313574,0
7
+ arc_easy,acc,0.5808080808080808,0.010124905282491185,0
8
+ arc_easy,acc_norm,0.5681818181818182,0.010163945352271737,0
9
+ boolq,acc,0.6055045871559633,0.008548152025770937,1
10
  cb,acc,0.42857142857142855,0.06672848092813058,1
11
  cb,f1,0.28729827005689074,,1
12
  copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.4320852419836686,0.004943537242344419,0
14
+ hellaswag,acc_norm,0.5630352519418442,0.004949969363017664,0
15
+ piqa,acc,0.735582154515778,0.010289787244767165,0
16
+ piqa,acc_norm,0.7317736670293797,0.010336761992404485,0
17
+ rte,acc,0.51985559566787,0.030072723167317184,0
18
+ sciq,acc,0.882,0.010206869264381793,0
19
+ sciq,acc_norm,0.868,0.010709373963528033,0
20
+ storycloze_2016,acc,0.6942811330839124,0.0106538848661906,0
21
+ winogrande,acc,0.5493291239147593,0.01398392886904024,0
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_4.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.73,
22
  "acc_stderr": 0.044619604333847394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.73,
22
  "acc_stderr": 0.044619604333847394
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4320852419836686,
26
+ "acc_stderr": 0.004943537242344419,
27
+ "acc_norm": 0.5630352519418442,
28
+ "acc_norm_stderr": 0.004949969363017664
29
+ },
30
+ "rte": {
31
+ "acc": 0.51985559566787,
32
+ "acc_stderr": 0.030072723167317184
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5493291239147593,
36
+ "acc_stderr": 0.01398392886904024
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6942811330839124,
40
+ "acc_stderr": 0.0106538848661906
41
+ },
42
+ "boolq": {
43
+ "acc": 0.6055045871559633,
44
+ "acc_stderr": 0.008548152025770937
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5808080808080808,
48
+ "acc_stderr": 0.010124905282491185,
49
+ "acc_norm": 0.5681818181818182,
50
+ "acc_norm_stderr": 0.010163945352271737
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.27559726962457337,
54
+ "acc_stderr": 0.01305716965576184,
55
+ "acc_norm": 0.29948805460750855,
56
+ "acc_norm_stderr": 0.013385021637313574
57
+ },
58
+ "sciq": {
59
+ "acc": 0.882,
60
+ "acc_stderr": 0.010206869264381793,
61
+ "acc_norm": 0.868,
62
+ "acc_norm_stderr": 0.010709373963528033
63
+ },
64
+ "piqa": {
65
+ "acc": 0.735582154515778,
66
+ "acc_stderr": 0.010289787244767165,
67
+ "acc_norm": 0.7317736670293797,
68
+ "acc_norm_stderr": 0.010336761992404485
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_5.csv CHANGED
@@ -2,6 +2,20 @@ task,metric,value,err,version
2
  anli_r1,acc,0.346,0.01505026612756444,0
3
  anli_r2,acc,0.346,0.01505026612756444,0
4
  anli_r3,acc,0.3358333333333333,0.013639261190932889,0
 
 
 
 
 
5
  cb,acc,0.375,0.06527912098338669,1
6
  cb,f1,0.21545893719806763,,1
7
  copa,acc,0.73,0.044619604333847394,0
 
 
 
 
 
 
 
 
 
 
2
  anli_r1,acc,0.346,0.01505026612756444,0
3
  anli_r2,acc,0.346,0.01505026612756444,0
4
  anli_r3,acc,0.3358333333333333,0.013639261190932889,0
5
+ arc_challenge,acc,0.28668941979522183,0.013214986329274783,0
6
+ arc_challenge,acc_norm,0.30204778156996587,0.01341751914471641,0
7
+ arc_easy,acc,0.5854377104377104,0.010108889212447772,0
8
+ arc_easy,acc_norm,0.555976430976431,0.01019528558078396,0
9
+ boolq,acc,0.608868501529052,0.008535239054221166,1
10
  cb,acc,0.375,0.06527912098338669,1
11
  cb,f1,0.21545893719806763,,1
12
  copa,acc,0.73,0.044619604333847394,0
13
+ hellaswag,acc,0.4302927703644692,0.00494105179521479,0
14
+ hellaswag,acc_norm,0.5671181039633539,0.004944620712318274,0
15
+ piqa,acc,0.7301414581066377,0.010356595421852209,0
16
+ piqa,acc_norm,0.733949945593036,0.01031003926335282,0
17
+ rte,acc,0.5270758122743683,0.030052303463143706,0
18
+ sciq,acc,0.881,0.010244215145336662,0
19
+ sciq,acc_norm,0.868,0.01070937396352803,0
20
+ storycloze_2016,acc,0.6926777124532336,0.01066944508186666,0
21
+ winogrande,acc,0.5603788476716653,0.013949649776015703,0
2b855b9bc4seed4/evaluation/rankeval/2b855b9bc4seed4_5.json CHANGED
@@ -20,6 +20,52 @@
20
  "copa": {
21
  "acc": 0.73,
22
  "acc_stderr": 0.044619604333847394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
  },
25
  "versions": {
@@ -27,6 +73,15 @@
27
  "anli_r2": 0,
28
  "anli_r3": 0,
29
  "cb": 1,
30
- "copa": 0
 
 
 
 
 
 
 
 
 
31
  }
32
  }
 
20
  "copa": {
21
  "acc": 0.73,
22
  "acc_stderr": 0.044619604333847394
23
+ },
24
+ "hellaswag": {
25
+ "acc": 0.4302927703644692,
26
+ "acc_stderr": 0.00494105179521479,
27
+ "acc_norm": 0.5671181039633539,
28
+ "acc_norm_stderr": 0.004944620712318274
29
+ },
30
+ "rte": {
31
+ "acc": 0.5270758122743683,
32
+ "acc_stderr": 0.030052303463143706
33
+ },
34
+ "winogrande": {
35
+ "acc": 0.5603788476716653,
36
+ "acc_stderr": 0.013949649776015703
37
+ },
38
+ "storycloze_2016": {
39
+ "acc": 0.6926777124532336,
40
+ "acc_stderr": 0.01066944508186666
41
+ },
42
+ "boolq": {
43
+ "acc": 0.608868501529052,
44
+ "acc_stderr": 0.008535239054221166
45
+ },
46
+ "arc_easy": {
47
+ "acc": 0.5854377104377104,
48
+ "acc_stderr": 0.010108889212447772,
49
+ "acc_norm": 0.555976430976431,
50
+ "acc_norm_stderr": 0.01019528558078396
51
+ },
52
+ "arc_challenge": {
53
+ "acc": 0.28668941979522183,
54
+ "acc_stderr": 0.013214986329274783,
55
+ "acc_norm": 0.30204778156996587,
56
+ "acc_norm_stderr": 0.01341751914471641
57
+ },
58
+ "sciq": {
59
+ "acc": 0.881,
60
+ "acc_stderr": 0.010244215145336662,
61
+ "acc_norm": 0.868,
62
+ "acc_norm_stderr": 0.01070937396352803
63
+ },
64
+ "piqa": {
65
+ "acc": 0.7301414581066377,
66
+ "acc_stderr": 0.010356595421852209,
67
+ "acc_norm": 0.733949945593036,
68
+ "acc_norm_stderr": 0.01031003926335282
69
  }
70
  },
71
  "versions": {
 
73
  "anli_r2": 0,
74
  "anli_r3": 0,
75
  "cb": 1,
76
+ "copa": 0,
77
+ "hellaswag": 0,
78
+ "rte": 0,
79
+ "winogrande": 0,
80
+ "storycloze_2016": 0,
81
+ "boolq": 1,
82
+ "arc_easy": 0,
83
+ "arc_challenge": 0,
84
+ "sciq": 0,
85
+ "piqa": 0
86
  }
87
  }