| dataset | prompt | metric | value |
|:---|:---|:---|---:|
| anli_dev_r1 | GPT-3 style | accuracy | 0.426 |
| anli_dev_r1 | MNLI crowdsource | accuracy | 0.402 |
| anli_dev_r1 | can we infer | accuracy | 0.401 |
| anli_dev_r1 | guaranteed/possible/impossible | accuracy | 0.314 |
| anli_dev_r1 | justified in saying | accuracy | 0.387 |
| anli_dev_r1 | median | accuracy | 0.401 |
| anli_dev_r2 | GPT-3 style | accuracy | 0.383 |
| anli_dev_r2 | MNLI crowdsource | accuracy | 0.374 |
| anli_dev_r2 | can we infer | accuracy | 0.394 |
| anli_dev_r2 | guaranteed/possible/impossible | accuracy | 0.302 |
| anli_dev_r2 | justified in saying | accuracy | 0.376 |
| anli_dev_r2 | median | accuracy | 0.376 |
| anli_dev_r3 | GPT-3 style | accuracy | 0.42 |
| anli_dev_r3 | MNLI crowdsource | accuracy | 0.4116666666666667 |
| anli_dev_r3 | can we infer | accuracy | 0.38916666666666666 |
| anli_dev_r3 | guaranteed/possible/impossible | accuracy | 0.2966666666666667 |
| anli_dev_r3 | justified in saying | accuracy | 0.35833333333333334 |
| anli_dev_r3 | median | accuracy | 0.38916666666666666 |
| story_cloze_2016 | Answer Given options | accuracy | 0.8524853019775521 |
| story_cloze_2016 | Choose Story Ending | accuracy | 0.8957776590058792 |
| story_cloze_2016 | Generate Ending | accuracy | 0.709246392303581 |
| story_cloze_2016 | Novel Correct Ending | accuracy | 0.8888295029396045 |
| story_cloze_2016 | Story Continuation and Options | accuracy | 0.8850881881346874 |
| story_cloze_2016 | median | accuracy | 0.8850881881346874 |
| super_glue_cb | GPT-3 style | accuracy | 0.8392857142857143 |
| super_glue_cb | MNLI crowdsource | accuracy | 0.35714285714285715 |
| super_glue_cb | can we infer | accuracy | 0.7857142857142857 |
| super_glue_cb | guaranteed/possible/impossible | accuracy | 0.5535714285714286 |
| super_glue_cb | justified in saying | accuracy | 0.7142857142857143 |
| super_glue_cb | median | accuracy | 0.7142857142857143 |
| super_glue_copa | C1 or C2? premise, so/because… | accuracy | 0.66 |
| super_glue_copa | best_option | accuracy | 0.77 |
| super_glue_copa | cause_effect | accuracy | 0.8 |
| super_glue_copa | i_am_hesitating | accuracy | 0.81 |
| super_glue_copa | plausible_alternatives | accuracy | 0.84 |
| super_glue_copa | median | accuracy | 0.8 |
| super_glue_rte | GPT-3 style | accuracy | 0.7906137184115524 |
| super_glue_rte | MNLI crowdsource | accuracy | 0.8267148014440433 |
| super_glue_rte | does it follow that | accuracy | 0.7942238267148014 |
| super_glue_rte | guaranteed true | accuracy | 0.776173285198556 |
| super_glue_rte | should assume | accuracy | 0.7617328519855595 |
| super_glue_rte | median | accuracy | 0.7906137184115524 |
| winogrande_winogrande_xl | Replace | accuracy | 0.5588003157063931 |
| winogrande_winogrande_xl | True or False | accuracy | 0.5280189423835833 |
| winogrande_winogrande_xl | does underscore refer to | accuracy | 0.5651144435674822 |
| winogrande_winogrande_xl | stand for | accuracy | 0.5082872928176796 |
| winogrande_winogrande_xl | underscore refer to | accuracy | 0.5651144435674822 |
| winogrande_winogrande_xl | median | accuracy | 0.5588003157063931 |
| xcopa_id | C1 or C2? premise, so/because… | accuracy | 0.46 |
| xcopa_id | best_option | accuracy | 0.7 |
| xcopa_id | cause_effect | accuracy | 0.73 |
| xcopa_id | i_am_hesitating | accuracy | 0.72 |
| xcopa_id | plausible_alternatives | accuracy | 0.67 |
| xcopa_id | median | accuracy | 0.7 |
| xcopa_sw | C1 or C2? premise, so/because… | accuracy | 0.6 |
| xcopa_sw | best_option | accuracy | 0.55 |
| xcopa_sw | cause_effect | accuracy | 0.54 |
| xcopa_sw | i_am_hesitating | accuracy | 0.51 |
| xcopa_sw | plausible_alternatives | accuracy | 0.52 |
| xcopa_sw | median | accuracy | 0.54 |
| xcopa_ta | C1 or C2? premise, so/because… | accuracy | 0.59 |
| xcopa_ta | best_option | accuracy | 0.56 |
| xcopa_ta | cause_effect | accuracy | 0.6 |
| xcopa_ta | i_am_hesitating | accuracy | 0.57 |
| xcopa_ta | plausible_alternatives | accuracy | 0.62 |
| xcopa_ta | median | accuracy | 0.59 |
| xcopa_vi | C1 or C2? premise, so/because… | accuracy | 0.53 |
| xcopa_vi | best_option | accuracy | 0.72 |
| xcopa_vi | cause_effect | accuracy | 0.72 |
| xcopa_vi | i_am_hesitating | accuracy | 0.7 |
| xcopa_vi | plausible_alternatives | accuracy | 0.71 |
| xcopa_vi | median | accuracy | 0.71 |
| xcopa_zh | C1 or C2? premise, so/because… | accuracy | 0.67 |
| xcopa_zh | best_option | accuracy | 0.7 |
| xcopa_zh | cause_effect | accuracy | 0.8 |
| xcopa_zh | i_am_hesitating | accuracy | 0.77 |
| xcopa_zh | plausible_alternatives | accuracy | 0.79 |
| xcopa_zh | median | accuracy | 0.77 |
| xnli_ar | GPT-3 style | accuracy | 0.5558232931726907 |
| xnli_ar | MNLI crowdsource | accuracy | 0.42128514056224897 |
| xnli_ar | can we infer | accuracy | 0.5148594377510041 |
| xnli_ar | guaranteed/possible/impossible | accuracy | 0.40562248995983935 |
| xnli_ar | justified in saying | accuracy | 0.4927710843373494 |
| xnli_ar | median | accuracy | 0.4927710843373494 |
| xnli_en | GPT-3 style | accuracy | 0.5891566265060241 |
| xnli_en | MNLI crowdsource | accuracy | 0.42610441767068274 |
| xnli_en | can we infer | accuracy | 0.5662650602409639 |
| xnli_en | guaranteed/possible/impossible | accuracy | 0.4614457831325301 |
| xnli_en | justified in saying | accuracy | 0.5437751004016064 |
| xnli_en | median | accuracy | 0.5437751004016064 |
| xnli_es | GPT-3 style | accuracy | 0.5734939759036145 |
| xnli_es | MNLI crowdsource | accuracy | 0.40923694779116465 |
| xnli_es | can we infer | accuracy | 0.5148594377510041 |
| xnli_es | guaranteed/possible/impossible | accuracy | 0.43132530120481927 |
| xnli_es | justified in saying | accuracy | 0.4610441767068273 |
| xnli_es | median | accuracy | 0.4610441767068273 |
| xnli_fr | GPT-3 style | accuracy | 0.5666666666666667 |
| xnli_fr | MNLI crowdsource | accuracy | 0.42208835341365464 |
| xnli_fr | can we infer | accuracy | 0.5385542168674698 |
| xnli_fr | guaranteed/possible/impossible | accuracy | 0.39076305220883534 |
| xnli_fr | justified in saying | accuracy | 0.5100401606425703 |
| xnli_fr | median | accuracy | 0.5100401606425703 |
| xnli_hi | GPT-3 style | accuracy | 0.5345381526104418 |
| xnli_hi | MNLI crowdsource | accuracy | 0.41124497991967873 |
| xnli_hi | can we infer | accuracy | 0.4751004016064257 |
| xnli_hi | guaranteed/possible/impossible | accuracy | 0.40923694779116465 |
| xnli_hi | justified in saying | accuracy | 0.4469879518072289 |
| xnli_hi | median | accuracy | 0.4469879518072289 |
| xnli_sw | GPT-3 style | accuracy | 0.4827309236947791 |
| xnli_sw | MNLI crowdsource | accuracy | 0.40562248995983935 |
| xnli_sw | can we infer | accuracy | 0.44497991967871486 |
| xnli_sw | guaranteed/possible/impossible | accuracy | 0.42289156626506025 |
| xnli_sw | justified in saying | accuracy | 0.41124497991967873 |
| xnli_sw | median | accuracy | 0.42289156626506025 |
| xnli_ur | GPT-3 style | accuracy | 0.4947791164658635 |
| xnli_ur | MNLI crowdsource | accuracy | 0.39759036144578314 |
| xnli_ur | can we infer | accuracy | 0.4502008032128514 |
| xnli_ur | guaranteed/possible/impossible | accuracy | 0.39036144578313253 |
| xnli_ur | justified in saying | accuracy | 0.40843373493975904 |
| xnli_ur | median | accuracy | 0.40843373493975904 |
| xnli_vi | GPT-3 style | accuracy | 0.5449799196787148 |
| xnli_vi | MNLI crowdsource | accuracy | 0.40401606425702813 |
| xnli_vi | can we infer | accuracy | 0.5 |
| xnli_vi | guaranteed/possible/impossible | accuracy | 0.44779116465863456 |
| xnli_vi | justified in saying | accuracy | 0.4650602409638554 |
| xnli_vi | median | accuracy | 0.4650602409638554 |
| xnli_zh | GPT-3 style | accuracy | 0.5429718875502008 |
| xnli_zh | MNLI crowdsource | accuracy | 0.3891566265060241 |
| xnli_zh | can we infer | accuracy | 0.5032128514056224 |
| xnli_zh | guaranteed/possible/impossible | accuracy | 0.38072289156626504 |
| xnli_zh | justified in saying | accuracy | 0.4706827309236948 |
| xnli_zh | median | accuracy | 0.4706827309236948 |
| xstory_cloze_ar | Answer Given options | accuracy | 0.6896095301125083 |
| xstory_cloze_ar | Choose Story Ending | accuracy | 0.8378557246856386 |
| xstory_cloze_ar | Generate Ending | accuracy | 0.5956320317670417 |
| xstory_cloze_ar | Novel Correct Ending | accuracy | 0.8213103904698875 |
| xstory_cloze_ar | Story Continuation and Options | accuracy | 0.8219722038385175 |
| xstory_cloze_ar | median | accuracy | 0.8213103904698875 |
| xstory_cloze_es | Answer Given options | accuracy | 0.7683653209794837 |
| xstory_cloze_es | Choose Story Ending | accuracy | 0.886168100595632 |
| xstory_cloze_es | Generate Ending | accuracy | 0.6724023825281271 |
| xstory_cloze_es | Novel Correct Ending | accuracy | 0.8676373262739907 |
| xstory_cloze_es | Story Continuation and Options | accuracy | 0.8769027134348114 |
| xstory_cloze_es | median | accuracy | 0.8676373262739907 |
| xstory_cloze_eu | Answer Given options | accuracy | 0.6082064857710126 |
| xstory_cloze_eu | Choose Story Ending | accuracy | 0.7266710787557908 |
| xstory_cloze_eu | Generate Ending | accuracy | 0.5552614162806089 |
| xstory_cloze_eu | Novel Correct Ending | accuracy | 0.700198544010589 |
| xstory_cloze_eu | Story Continuation and Options | accuracy | 0.7107875579086698 |
| xstory_cloze_eu | median | accuracy | 0.700198544010589 |
| xstory_cloze_hi | Answer Given options | accuracy | 0.6366644606221046 |
| xstory_cloze_hi | Choose Story Ending | accuracy | 0.7882197220383852 |
| xstory_cloze_hi | Generate Ending | accuracy | 0.5982792852415619 |
| xstory_cloze_hi | Novel Correct Ending | accuracy | 0.7485109199205824 |
| xstory_cloze_hi | Story Continuation and Options | accuracy | 0.7683653209794837 |
| xstory_cloze_hi | median | accuracy | 0.7485109199205824 |
| xstory_cloze_id | Answer Given options | accuracy | 0.7385837193911317 |
| xstory_cloze_id | Choose Story Ending | accuracy | 0.8332230311052283 |
| xstory_cloze_id | Generate Ending | accuracy | 0.6293845135671741 |
| xstory_cloze_id | Novel Correct Ending | accuracy | 0.7816015883520847 |
| xstory_cloze_id | Story Continuation and Options | accuracy | 0.8226340172071476 |
| xstory_cloze_id | median | accuracy | 0.7816015883520847 |
| xstory_cloze_zh | Answer Given options | accuracy | 0.7498345466578424 |
| xstory_cloze_zh | Choose Story Ending | accuracy | 0.8583719391131701 |
| xstory_cloze_zh | Generate Ending | accuracy | 0.6227663798808736 |
| xstory_cloze_zh | Novel Correct Ending | accuracy | 0.8405029781601588 |
| xstory_cloze_zh | Story Continuation and Options | accuracy | 0.8385175380542687 |
| xstory_cloze_zh | median | accuracy | 0.8385175380542687 |
| xwinograd_en | Replace | accuracy | 0.6576344086021505 |
| xwinograd_en | True or False | accuracy | 0.5187096774193548 |
| xwinograd_en | does underscore refer to | accuracy | 0.5931182795698925 |
| xwinograd_en | stand for | accuracy | 0.5070967741935484 |
| xwinograd_en | underscore refer to | accuracy | 0.6210752688172043 |
| xwinograd_en | median | accuracy | 0.5931182795698925 |
| xwinograd_fr | Replace | accuracy | 0.5180722891566265 |
| xwinograd_fr | True or False | accuracy | 0.5301204819277109 |
| xwinograd_fr | does underscore refer to | accuracy | 0.5542168674698795 |
| xwinograd_fr | stand for | accuracy | 0.5180722891566265 |
| xwinograd_fr | underscore refer to | accuracy | 0.5421686746987951 |
| xwinograd_fr | median | accuracy | 0.5301204819277109 |
| xwinograd_pt | Replace | accuracy | 0.5741444866920152 |
| xwinograd_pt | True or False | accuracy | 0.4790874524714829 |
| xwinograd_pt | does underscore refer to | accuracy | 0.55893536121673 |
| xwinograd_pt | stand for | accuracy | 0.5209125475285171 |
| xwinograd_pt | underscore refer to | accuracy | 0.5437262357414449 |
| xwinograd_pt | median | accuracy | 0.5437262357414449 |
| xwinograd_zh | Replace | accuracy | 0.626984126984127 |
| xwinograd_zh | True or False | accuracy | 0.503968253968254 |
| xwinograd_zh | does underscore refer to | accuracy | 0.5436507936507936 |
| xwinograd_zh | stand for | accuracy | 0.49007936507936506 |
| xwinograd_zh | underscore refer to | accuracy | 0.5535714285714286 |
| xwinograd_zh | median | accuracy | 0.5436507936507936 |
| multiple | average | multiple | 0.6067197952551315 |
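
The `median` rows and the final `average` row follow directly from the per-prompt accuracies: each dataset's median is taken over its five prompts, and the closing `multiple / average` figure is consistent with the unweighted mean of those per-dataset medians. A minimal sketch of that aggregation, assuming the table has been exported to a hypothetical `results.csv` with the same four columns:

```python
# Sketch only: assumes the table above is saved as "results.csv"
# with columns dataset,prompt,metric,value (filename is an assumption).
import csv
from collections import defaultdict
from statistics import mean, median

per_dataset = defaultdict(list)  # dataset -> list of per-prompt accuracies

with open("results.csv", newline="") as f:
    for row in csv.DictReader(f):
        # Skip the pre-aggregated rows so only raw per-prompt scores are collected.
        if row["prompt"] in ("median", "average"):
            continue
        per_dataset[row["dataset"]].append(float(row["value"]))

# Median accuracy across each dataset's prompts (the "median" rows above).
medians = {ds: median(vals) for ds, vals in per_dataset.items()}

# Unweighted mean of the per-dataset medians (the final "multiple / average" row).
overall = mean(medians.values())

for ds, m in sorted(medians.items()):
    print(f"{ds:30s} {m:.4f}")
print(f"{'average of medians':30s} {overall:.4f}")
```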