synth_roberta / per_sample_predictions.csv
wlg1's picture
Upload folder using huggingface_hub
3f923ae verified
raw
history blame
12.3 kB
logit_gpt-4o,logit_DeepSeek-V3-0324,logit_Llama-4-maverick-17b-128e-instruct-fp8,logit_qwen25-coder-32b-instruct,logit_gpt-4.1-mini,chosen_executor,correct_prediction,true_gpt-4o,true_DeepSeek-V3-0324,true_Llama-4-maverick-17b-128e-instruct-fp8,true_qwen25-coder-32b-instruct,true_gpt-4.1-mini
0.032958984375,0.06298828125,-0.05224609375,0.083984375,0.1201171875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
0.0712890625,0.046875,-0.0264892578125,0.1416015625,-0.01190185546875,qwen25-coder-32b-instruct,1,1.0,1.0,0.0,1.0,1.0
0.1611328125,0.1923828125,0.18359375,0.25390625,0.3671875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.0732421875,-0.12109375,-0.193359375,-0.10498046875,-0.1259765625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.2578125,-0.17578125,-0.2265625,-0.10986328125,-0.2158203125,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
-0.267578125,-0.2177734375,-0.310546875,-0.337890625,-0.310546875,DeepSeek-V3-0324,1,0.0,1.0,0.0,0.0,0.0
0.11376953125,0.0272216796875,-0.053466796875,0.058837890625,0.08203125,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
-0.267578125,-0.380859375,-0.4140625,-0.3515625,-0.59765625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.2041015625,-0.33203125,-0.39453125,-0.328125,-0.349609375,gpt-4o,1,1.0,1.0,0.0,1.0,1.0
-0.045166015625,-0.0791015625,-0.07275390625,-0.0091552734375,0.0634765625,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.11328125,-0.1103515625,-0.162109375,-0.1455078125,-0.1787109375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0
0.043212890625,0.08544921875,0.09033203125,0.1025390625,0.1572265625,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.0380859375,-0.0888671875,-0.24609375,-0.12158203125,-0.047607421875,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.17578125,-0.251953125,-0.259765625,-0.2021484375,-0.2119140625,gpt-4o,1,1.0,0.0,0.0,1.0,0.0
0.040771484375,0.0654296875,-0.051513671875,-0.0205078125,0.16796875,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.00095367431640625,-0.203125,-0.1884765625,-0.154296875,-0.158203125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.318359375,-0.423828125,-0.408203125,-0.38671875,-0.61328125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.051025390625,-0.05615234375,-0.2109375,0.0169677734375,0.005889892578125,qwen25-coder-32b-instruct,0,0.0,1.0,0.0,0.0,0.0
-0.0191650390625,-0.0693359375,-0.140625,-0.103515625,-0.004852294921875,gpt-4.1-mini,0,1.0,1.0,1.0,1.0,0.0
-0.11962890625,-0.18359375,-0.2236328125,-0.12060546875,-0.12060546875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
0.1943359375,0.1572265625,0.1640625,0.205078125,0.31640625,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.228515625,-0.1474609375,-0.2890625,-0.30078125,-0.302734375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0
-0.1943359375,-0.38671875,-0.388671875,-0.1953125,-0.267578125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.423828125,-0.251953125,-0.31640625,-0.2890625,-0.458984375,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0
-0.2421875,-0.2578125,-0.326171875,-0.228515625,-0.4140625,qwen25-coder-32b-instruct,0,0.0,0.0,1.0,0.0,0.0
0.07568359375,0.1005859375,0.0264892578125,0.234375,0.2734375,gpt-4.1-mini,1,1.0,1.0,0.0,1.0,1.0
-0.2294921875,-0.474609375,-0.296875,-0.36328125,-0.6015625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.057861328125,-0.083984375,-0.1435546875,-0.0189208984375,-0.0033111572265625,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
0.296875,0.1376953125,0.09716796875,0.275390625,0.427734375,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
0.00335693359375,-0.0673828125,-0.1103515625,-0.0595703125,0.0247802734375,gpt-4.1-mini,1,0.0,1.0,1.0,1.0,1.0
-0.169921875,-0.28515625,-0.400390625,-0.1533203125,-0.3828125,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
0.00665283203125,-0.07470703125,-0.1162109375,-0.150390625,0.0306396484375,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.283203125,-0.3046875,-0.431640625,-0.21484375,-0.44140625,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
0.154296875,0.109375,0.00323486328125,0.177734375,0.279296875,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.095703125,-0.236328125,-0.28125,-0.0908203125,-0.17578125,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0
-0.04150390625,-0.0908203125,-0.154296875,-0.07373046875,-0.030029296875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.35546875,-0.326171875,-0.37890625,-0.328125,-0.46875,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0
0.004913330078125,-0.00799560546875,-0.07666015625,0.000759124755859375,0.134765625,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.318359375,-0.296875,-0.244140625,-0.34765625,-0.439453125,Llama-4-maverick-17b-128e-instruct-fp8,1,0.0,1.0,1.0,0.0,1.0
-0.044189453125,-0.08203125,-0.09130859375,-0.03271484375,-0.0771484375,qwen25-coder-32b-instruct,1,1.0,0.0,0.0,1.0,0.0
-0.16015625,-0.37890625,-0.421875,-0.1630859375,-0.22265625,gpt-4o,0,0.0,1.0,1.0,1.0,1.0
-0.197265625,-0.224609375,-0.1767578125,-0.22265625,-0.3203125,Llama-4-maverick-17b-128e-instruct-fp8,0,0.0,0.0,0.0,0.0,1.0
0.1474609375,0.0439453125,-0.05908203125,0.158203125,0.12451171875,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
0.28515625,0.31640625,0.15234375,0.39453125,0.578125,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.032958984375,-0.054931640625,-0.053955078125,-0.07861328125,0.04541015625,gpt-4.1-mini,1,1.0,1.0,0.0,1.0,1.0
0.1396484375,0.1474609375,0.0230712890625,0.259765625,0.220703125,qwen25-coder-32b-instruct,1,0.0,0.0,0.0,1.0,0.0
0.007659912109375,-0.07177734375,-0.08056640625,-0.0162353515625,-0.0859375,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.1708984375,-0.12890625,-0.12158203125,-0.2001953125,-0.2216796875,Llama-4-maverick-17b-128e-instruct-fp8,1,1.0,1.0,1.0,1.0,1.0
-0.1435546875,-0.41015625,-0.330078125,-0.228515625,-0.48828125,gpt-4o,1,1.0,0.0,0.0,1.0,0.0
0.06689453125,0.0101318359375,0.037841796875,0.0303955078125,0.1279296875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.06640625,-0.09033203125,-0.2001953125,-0.10546875,-0.07470703125,gpt-4o,0,0.0,1.0,1.0,0.0,1.0
-0.2412109375,-0.2734375,-0.18359375,-0.28125,-0.388671875,Llama-4-maverick-17b-128e-instruct-fp8,1,0.0,0.0,1.0,0.0,1.0
0.296875,0.337890625,0.2734375,0.39453125,0.5234375,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
0.1845703125,0.1240234375,0.0189208984375,0.27734375,0.2451171875,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0
-0.0908203125,-0.064453125,-0.2041015625,-0.203125,-0.1689453125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0
-0.12353515625,-0.314453125,-0.40625,-0.25390625,-0.416015625,gpt-4o,1,1.0,0.0,1.0,0.0,1.0
-0.1884765625,-0.39453125,-0.39453125,-0.1904296875,-0.2421875,gpt-4o,1,1.0,1.0,0.0,1.0,0.0
0.1669921875,0.2158203125,0.15625,0.251953125,0.333984375,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.00946044921875,-0.052001953125,-0.0302734375,0.01531982421875,0.036865234375,gpt-4.1-mini,1,1.0,1.0,1.0,0.0,1.0
-0.1396484375,-0.296875,-0.3515625,-0.361328125,-0.466796875,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
0.06396484375,0.1279296875,0.0654296875,0.04931640625,0.2431640625,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
0.2255859375,0.1767578125,0.1884765625,0.259765625,0.392578125,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.2578125,-0.314453125,-0.37109375,-0.341796875,-0.478515625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.134765625,-0.1484375,-0.3046875,-0.193359375,-0.2197265625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
0.00927734375,-0.0654296875,-0.15625,-0.01373291015625,-0.015869140625,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
0.002655029296875,-0.0849609375,-0.0361328125,-0.0233154296875,-0.022216796875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
-0.16796875,-0.2080078125,-0.232421875,-0.0279541015625,-0.1708984375,qwen25-coder-32b-instruct,1,0.0,0.0,0.0,1.0,0.0
0.0537109375,0.04931640625,0.025390625,0.10498046875,0.1552734375,gpt-4.1-mini,1,1.0,0.0,1.0,1.0,1.0
-0.1474609375,-0.232421875,-0.255859375,-0.15234375,-0.228515625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
0.0361328125,0.028076171875,-0.09130859375,0.08935546875,0.1279296875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.006103515625,-0.06591796875,-0.2177734375,0.06005859375,0.11328125,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.19921875,-0.322265625,-0.294921875,-0.2333984375,-0.44921875,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.034423828125,-0.0238037109375,-0.12890625,-0.06787109375,-0.024169921875,DeepSeek-V3-0324,0,1.0,0.0,0.0,0.0,0.0
0.11962890625,-0.0380859375,0.045166015625,0.1171875,0.0498046875,gpt-4o,0,0.0,1.0,0.0,0.0,0.0
0.1552734375,0.0162353515625,-0.0234375,0.177734375,0.296875,gpt-4.1-mini,1,1.0,1.0,1.0,0.0,1.0
-0.259765625,-0.318359375,-0.376953125,-0.291015625,-0.380859375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
-0.0184326171875,-0.03466796875,-0.0245361328125,0.07861328125,0.09521484375,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
0.0771484375,0.06201171875,0.01470947265625,0.10791015625,0.2138671875,gpt-4.1-mini,0,0.0,1.0,1.0,0.0,0.0
-0.0294189453125,-0.034423828125,-0.1904296875,0.0478515625,0.10888671875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.11083984375,-0.024169921875,-0.14453125,-0.08251953125,-0.0703125,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,0.0
0.055908203125,0.08984375,0.043701171875,0.130859375,0.138671875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.15234375,-0.236328125,-0.275390625,-0.1767578125,-0.234375,gpt-4o,1,1.0,1.0,1.0,1.0,0.0
0.169921875,0.051025390625,0.09716796875,0.158203125,0.21875,gpt-4.1-mini,1,0.0,0.0,0.0,0.0,1.0
-0.17578125,-0.3359375,-0.328125,-0.302734375,-0.376953125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.03564453125,-0.1201171875,-0.05126953125,-0.00372314453125,0.010009765625,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.125,-0.263671875,-0.263671875,-0.162109375,-0.251953125,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
0.03564453125,-0.0284423828125,-0.083984375,-0.07763671875,0.083984375,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.06494140625,-0.1953125,-0.1982421875,-0.166015625,-0.169921875,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
-0.3125,-0.1845703125,-0.236328125,-0.2197265625,-0.392578125,DeepSeek-V3-0324,0,0.0,0.0,0.0,0.0,0.0
-0.058349609375,-0.09375,-0.068359375,0.01043701171875,-0.0966796875,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
-0.0673828125,-0.12060546875,-0.1484375,-0.1982421875,-0.05419921875,gpt-4.1-mini,1,1.0,0.0,1.0,0.0,1.0
-0.306640625,-0.251953125,-0.375,-0.23828125,-0.455078125,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0
-0.10302734375,-0.1416015625,-0.236328125,-0.0184326171875,-0.037841796875,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
-0.0244140625,-0.1748046875,-0.248046875,-0.031982421875,-0.00726318359375,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.142578125,-0.0908203125,-0.189453125,-0.041259765625,-0.2470703125,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
-0.2412109375,-0.435546875,-0.4453125,-0.296875,-0.53515625,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
-0.0223388671875,-0.0869140625,-0.1611328125,-0.0093994140625,-0.076171875,qwen25-coder-32b-instruct,0,1.0,0.0,0.0,0.0,1.0
-0.12451171875,-0.11962890625,-0.1435546875,-0.04248046875,-0.1533203125,qwen25-coder-32b-instruct,1,1.0,1.0,1.0,1.0,1.0
-0.1103515625,-0.330078125,-0.3359375,-0.1337890625,-0.173828125,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
-0.1943359375,-0.2060546875,-0.259765625,-0.2294921875,-0.1796875,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
0.0306396484375,0.031982421875,0.00921630859375,-0.045166015625,0.05419921875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.09765625,-0.12890625,-0.265625,-0.11181640625,-0.044677734375,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.171875,-0.14453125,-0.26953125,-0.2431640625,-0.212890625,DeepSeek-V3-0324,1,1.0,1.0,1.0,1.0,1.0
-0.04833984375,-0.125,-0.142578125,-0.04150390625,-0.041748046875,qwen25-coder-32b-instruct,0,0.0,0.0,0.0,0.0,0.0
-0.02294921875,-0.0712890625,-0.1474609375,-0.0286865234375,-0.06494140625,gpt-4o,0,0.0,0.0,0.0,0.0,0.0
-0.296875,-0.328125,-0.416015625,-0.365234375,-0.34765625,gpt-4o,1,1.0,1.0,0.0,0.0,1.0
-0.203125,-0.2041015625,-0.228515625,-0.21875,-0.267578125,gpt-4o,1,1.0,1.0,1.0,0.0,1.0
0.189453125,0.146484375,0.1875,0.09375,0.255859375,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
0.2041015625,0.173828125,0.1142578125,0.259765625,0.310546875,gpt-4.1-mini,0,0.0,0.0,0.0,0.0,0.0
-0.2080078125,-0.2021484375,-0.31640625,-0.255859375,-0.23828125,DeepSeek-V3-0324,1,0.0,1.0,0.0,0.0,0.0
-0.11572265625,-0.2490234375,-0.318359375,-0.23046875,-0.318359375,gpt-4o,1,1.0,1.0,1.0,1.0,1.0
0.162109375,0.224609375,0.19140625,0.2412109375,0.423828125,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
-0.02685546875,-0.09228515625,-0.1748046875,-0.08447265625,-0.022216796875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
0.2177734375,0.14453125,0.1552734375,0.201171875,0.357421875,gpt-4.1-mini,1,1.0,1.0,1.0,1.0,1.0
predicted_proportions,0.3158,0.0965,0.0351,0.1667,0.3860
true_proportions,0.1288,0.1442,0.1156,0.1222,0.1295
correct_fraction,0.5526