{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "A100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "898742a6f98943d3977d4135d1e46d32": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [], "layout": "IPY_MODEL_ca9bde4c77954f0abf7a857c91ff7c0d" } }, "4cc6d51e3ba543e0a452cf639ec63aba": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ffa7ff02e8854b00932365e1ae8bd42f", "placeholder": "​", "style": "IPY_MODEL_16c741f3fdd1476496a38f5a6373baa7", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "24907ddaa1d54b1e9b57ff2c61fa21bf": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_52fdc432c58e44bd855bd9a6033abc7c", "placeholder": "​", "style": "IPY_MODEL_b93bf39b377a4528bc498c05ce4c442e", "value": "" } }, "a62e342f5bd14499a1d020bc492f0ee6": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_f9172ffb0dd945fdbd11dbb7e7018afc", "style": "IPY_MODEL_01237f8ed3f44f31ac3f7cd32a6ded7e", "value": true } }, "45dd5ef8ab3948c3bd9a9bef684a5cc2": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_f60e62649041480db85fce6024185fcf", "style": "IPY_MODEL_0f7f963191414aa5bc76c837abe43b3d", "tooltip": "" } }, "4bb92b8331644d3caa920c359112b4e7": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6d5ba8176b084d45aaa77d580b68bc48", "placeholder": "​", "style": "IPY_MODEL_65fbfda7f3cf4a75bd2160322cf47ab9", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " } }, "ca9bde4c77954f0abf7a857c91ff7c0d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "ffa7ff02e8854b00932365e1ae8bd42f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", 
"model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "16c741f3fdd1476496a38f5a6373baa7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "52fdc432c58e44bd855bd9a6033abc7c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, 
"grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b93bf39b377a4528bc498c05ce4c442e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f9172ffb0dd945fdbd11dbb7e7018afc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "01237f8ed3f44f31ac3f7cd32a6ded7e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f60e62649041480db85fce6024185fcf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0f7f963191414aa5bc76c837abe43b3d": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "6d5ba8176b084d45aaa77d580b68bc48": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "65fbfda7f3cf4a75bd2160322cf47ab9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "57353db52e4c44949529881fd7b658c7": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4cddeffd06e744d0bb6441cc19b0a942", "placeholder": "​", "style": "IPY_MODEL_4d28301682be4c46b9ec2c97c468ea0a", "value": "Connecting..." } }, "4cddeffd06e744d0bb6441cc19b0a942": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4d28301682be4c46b9ec2c97c468ea0a": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", 
"description_width": "" } }, "f832dabb01de4e1b97bd7c21c7e0e9c6": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d7193e0d76af476ea4873ca54986738c", "IPY_MODEL_3a5fdf5c274a479594428c0867c91374", "IPY_MODEL_7bc0cad0326f44e1b4ddc5ef13bf2518" ], "layout": "IPY_MODEL_c98852c25a5842dabf169a2c74cf52b1" } }, "d7193e0d76af476ea4873ca54986738c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c202f602bd1e405d9837a1bc2bce5d42", "placeholder": "​", "style": "IPY_MODEL_69d7a592b912421db9eb32a41e141069", "value": "config.json: 100%" } }, "3a5fdf5c274a479594428c0867c91374": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_28a64c5b804440cb886ea0fb53aa1877", "max": 772, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_2a8f2e40a6df420f9ff1e1150054efcd", "value": 772 } }, 
"7bc0cad0326f44e1b4ddc5ef13bf2518": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8b13fcfdf7244095bcb8d95604105463", "placeholder": "​", "style": "IPY_MODEL_31bb547c74d14810b98cd11222ab0b4e", "value": " 772/772 [00:00<00:00, 71.8kB/s]" } }, "c98852c25a5842dabf169a2c74cf52b1": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c202f602bd1e405d9837a1bc2bce5d42": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": 
"1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "69d7a592b912421db9eb32a41e141069": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "28a64c5b804440cb886ea0fb53aa1877": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2a8f2e40a6df420f9ff1e1150054efcd": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "8b13fcfdf7244095bcb8d95604105463": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": 
null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "31bb547c74d14810b98cd11222ab0b4e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "22c8ceff4cad4d2f9064b31fa8a119ac": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_dc3be8d1937247c5b909f862b8fd6b1c", "IPY_MODEL_a76f964b417b462f9faa50f3c672eb9c", "IPY_MODEL_a485a5ea4801495cb8498f0cfbfd8f11" ], "layout": "IPY_MODEL_a35ae39aca914af083536512e91be51c" } }, "dc3be8d1937247c5b909f862b8fd6b1c": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c83c9142abeb4db5bc6eeb1d893b69bc", "placeholder": "​", "style": "IPY_MODEL_0fce3a68b4a04f4698b7c378f2431a58", "value": "tf_model.h5: 100%" } }, "a76f964b417b462f9faa50f3c672eb9c": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b68b93a049834ff881b4e85396c47129", "max": 1246320936, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_d829bd4541654278ad01bbea3462b2e0", "value": 1246320936 } }, "a485a5ea4801495cb8498f0cfbfd8f11": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c2a4044283c24b0285bf32eb71d74824", "placeholder": "​", "style": "IPY_MODEL_b69bfcb2a3de4104b02bb26cc1892613", "value": " 1.25G/1.25G [00:05<00:00, 248MB/s]" } }, "a35ae39aca914af083536512e91be51c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c83c9142abeb4db5bc6eeb1d893b69bc": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0fce3a68b4a04f4698b7c378f2431a58": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b68b93a049834ff881b4e85396c47129": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", 
"state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d829bd4541654278ad01bbea3462b2e0": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c2a4044283c24b0285bf32eb71d74824": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, 
"grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b69bfcb2a3de4104b02bb26cc1892613": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f0571fa7c0dc4502b6057eb6c1e0f1a1": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_5c299c4c55124c3780b896428d2d36b2", "IPY_MODEL_0c757cfe5eaf46b5a1e0f2ad8af2ad65", "IPY_MODEL_0e3740b1cb904ddfad4b659e979c1768" ], "layout": "IPY_MODEL_87540dfec4fe4d3ba60dd14050bcac90" } }, "5c299c4c55124c3780b896428d2d36b2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b47685ec5c8147eba4ea353344da87ca", "placeholder": "​", "style": "IPY_MODEL_7694847c4d0e4defa9485f4bbbcc8b2b", "value": "tokenizer_config.json: 100%" } }, "0c757cfe5eaf46b5a1e0f2ad8af2ad65": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7bd649b1deaf48a9aa2a0dde5d3d92ab", "max": 453, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_90d7a474658d4822ad7e2d23b52edf42", "value": 453 } }, "0e3740b1cb904ddfad4b659e979c1768": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c137c41aedba44578b1734f91c5da313", "placeholder": "​", "style": "IPY_MODEL_cc09e9bb1c984cd49437736f8c6e6ba0", "value": " 453/453 [00:00<00:00, 44.4kB/s]" } }, "87540dfec4fe4d3ba60dd14050bcac90": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b47685ec5c8147eba4ea353344da87ca": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7694847c4d0e4defa9485f4bbbcc8b2b": { "model_module": "@jupyter-widgets/controls", 
"model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "7bd649b1deaf48a9aa2a0dde5d3d92ab": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "90d7a474658d4822ad7e2d23b52edf42": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c137c41aedba44578b1734f91c5da313": { "model_module": 
"@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cc09e9bb1c984cd49437736f8c6e6ba0": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "8cae6715b19e4c46a3de982950045f31": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ 
"IPY_MODEL_b3ad4244bd3d4def916013b9cdd0e529", "IPY_MODEL_36795ecb440e44a294472dc8b81d0121", "IPY_MODEL_c16066fd1c584b7aa9c11cf087deddaa" ], "layout": "IPY_MODEL_7f3c219cb89b49e4ad653aafc9a9fd60" } }, "b3ad4244bd3d4def916013b9cdd0e529": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_7be5a2a5bfee4719bac6649842744cd7", "placeholder": "​", "style": "IPY_MODEL_ae0853b3f4054cf08494eed2da4059f8", "value": "tokenizer.json: 100%" } }, "36795ecb440e44a294472dc8b81d0121": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d7cf7850b21745df946a44b36e227553", "max": 17082660, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_867a96191cbe48d7a22be575d748f04e", "value": 17082660 } }, "c16066fd1c584b7aa9c11cf087deddaa": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": 
"IPY_MODEL_31157ef17afc4da293f61c7440bdf73f", "placeholder": "​", "style": "IPY_MODEL_136be58a40e1430283ddbf04594b6433", "value": " 17.1M/17.1M [00:00<00:00, 43.9MB/s]" } }, "7f3c219cb89b49e4ad653aafc9a9fd60": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "7be5a2a5bfee4719bac6649842744cd7": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, 
"grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ae0853b3f4054cf08494eed2da4059f8": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d7cf7850b21745df946a44b36e227553": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, 
"visibility": null, "width": null } }, "867a96191cbe48d7a22be575d748f04e": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "31157ef17afc4da293f61c7440bdf73f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "136be58a40e1430283ddbf04594b6433": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "21e8778f26404971b06b174aeb46e0a4": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_16cb57dfeda04dc693ce196fc5a2f9b2", "IPY_MODEL_e0ecf0c5ea2b44e0b0331acee5172588", "IPY_MODEL_0a4528fea6cf4be6af58cba8f55ae0a6" ], "layout": "IPY_MODEL_6022886bee0945bbb79e1135495850b4" } }, "16cb57dfeda04dc693ce196fc5a2f9b2": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_95c32e816fc14c089164377e3dd1eac8", "placeholder": "​", "style": "IPY_MODEL_0e80843dafc3438fb614eeb01be4650f", "value": "special_tokens_map.json: 100%" } }, "e0ecf0c5ea2b44e0b0331acee5172588": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e245e508678f46b99126ea7bff49bfbe", "max": 280, "min": 0, "orientation": "horizontal", "style": 
"IPY_MODEL_0f94be2ff070471d9d5dc71661b3016d", "value": 280 } }, "0a4528fea6cf4be6af58cba8f55ae0a6": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6a355e8ef31a4302a26b97024a7ac346", "placeholder": "​", "style": "IPY_MODEL_c7e7ae5f09d043cebb2ed3ba566f6223", "value": " 280/280 [00:00<00:00, 26.5kB/s]" } }, "6022886bee0945bbb79e1135495850b4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "95c32e816fc14c089164377e3dd1eac8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { 
"_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0e80843dafc3438fb614eeb01be4650f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e245e508678f46b99126ea7bff49bfbe": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0f94be2ff070471d9d5dc71661b3016d": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "6a355e8ef31a4302a26b97024a7ac346": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, 
"order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c7e7ae5f09d043cebb2ed3ba566f6223": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "notebook_login()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 17, "referenced_widgets": [ "898742a6f98943d3977d4135d1e46d32", "4cc6d51e3ba543e0a452cf639ec63aba", "24907ddaa1d54b1e9b57ff2c61fa21bf", "a62e342f5bd14499a1d020bc492f0ee6", "45dd5ef8ab3948c3bd9a9bef684a5cc2", "4bb92b8331644d3caa920c359112b4e7", "ca9bde4c77954f0abf7a857c91ff7c0d", "ffa7ff02e8854b00932365e1ae8bd42f", "16c741f3fdd1476496a38f5a6373baa7", "52fdc432c58e44bd855bd9a6033abc7c", "b93bf39b377a4528bc498c05ce4c442e", "f9172ffb0dd945fdbd11dbb7e7018afc", "01237f8ed3f44f31ac3f7cd32a6ded7e", "f60e62649041480db85fce6024185fcf", "0f7f963191414aa5bc76c837abe43b3d", "6d5ba8176b084d45aaa77d580b68bc48", "65fbfda7f3cf4a75bd2160322cf47ab9", "57353db52e4c44949529881fd7b658c7", "4cddeffd06e744d0bb6441cc19b0a942", "4d28301682be4c46b9ec2c97c468ea0a" ] }, "id": "yrM6ZzXldMLo", "outputId": "ca439ee7-bb5b-4389-fde0-0f18606fab50" }, "execution_count": 1, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Count
0POLGPT-4oPartialTrainNie brak im na szczęście plusów. To na pewno w...2919318Nie brak im na szczęście plusów. To na pewno w...59397
1POLAmazon-Nova-Lite-1.0RewrittenDevProsto z kryminału polazłem na plebanię do ks....543460Dzień po wydarzeniach złożonym z plebanii księ...113785
2POLAmazon-Nova-Lite-1.0PartialTestRyszard spotyka się profesorem w sprawie remon...2114913Ryszard spotyka się profesorem w sprawie remon...107732
3POLAya-23PartialTest- Ciekawostką jest, że po jednej stronie, w wy...5132626- Ciekawostką jest, że po jednej stronie, w wy...52358
4POLMistral-Large-2411PartialDevJeśli by się zgodził, to byłby z pewnością kro...4329614Jeśli by się zgodził, to byłby z pewnością kro...69458
....................................
99843POLAmazon-Nova-Lite-1.0PartialTestDla leniwych fanów muzyki mamy dobrą informacj...3120218Dla leniwych fanów muzyki mamy dobrą informacj...74512
99844POLAmazon-Nova-Pro-1.0RewrittenTestZ kontroli wynika, że wojskowe zarządy infrast...574080Według ostatnich informacji, kontrola przeprow...66489
99845POLGemini-Pro-1.5PartialTestTym samym apeluję do wszystkich obywateli tego...5943429Tym samym apeluję do wszystkich obywateli tego...99733
99846POLAmazon-Nova-Lite-1.0PartialTrainHurtowa cena ogórków jest niska, 'krótkie' kos...4327626Hurtowa cena ogórków jest niska, 'krótkie' kos...94649
99847POLAmazon-Nova-Lite-1.0PartialTestJak zwykle, ruszając w odwiedziny do naszych C...11376849Jak zwykle, ruszając w odwiedziny do naszych C...132939
\n", "

99848 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 99848,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Amazon-Nova-Pro-1.0\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Partial\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Train\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99847,\n \"samples\": [\n \"Wszystko to prawda, cho\\u0107 tak naprawd\\u0119 by\\u0142... Szale\\u0144cem? Cz\\u0142owiekiem pozbawionym uczu\\u0107 wy\\u017cszych? Mitomanem ulegaj\\u0105cym legendom, w kt\\u00f3rych nie by\\u0142o \\u017ad\\u017ab\\u0142a prawdy? Zakompleksionym facetem boj\\u0105cym si\\u0119 kobiet? Fanatykiem? Kuglarzem wierz\\u0105cym w gus\\u0142a i okultyzm? 
Nigdy si\\u0119 nie dowiemy.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 31,\n \"min\": 4,\n \"max\": 1857,\n \"num_unique_values\": 366,\n \"samples\": [\n 210\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 208,\n \"min\": 9,\n \"max\": 8974,\n \"num_unique_values\": 1646,\n \"samples\": [\n 910\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 957,\n \"num_unique_values\": 263,\n \"samples\": [\n 141\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99845,\n \"samples\": [\n \"Fina\\u0142owe pojedynki nie przynios\\u0142y emocji. W pi\\u0105tek Pomorzanin przegra\\u0142 na w\\u0142asnym boisku a\\u017c 1:6. W drugim meczu torunianie zacz\\u0119li obiecuj\\u0105co - po 90 sekundach wygrywali 1:0 po bramce Micha\\u0142a Kunklewskiego. 
Poznaniacy szybko wyr\\u00f3wnali, a potem z ka\\u017cd\\u0105 minut\\u0105 dominowali na boisku.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 38,\n \"min\": 1,\n \"max\": 4041,\n \"num_unique_values\": 373,\n \"samples\": [\n 311\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 252,\n \"min\": 6,\n \"max\": 13840,\n \"num_unique_values\": 1880,\n \"samples\": [\n 907\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 2 } ], "source": [ "import pandas as pd\n", "# Language name -> CSV path; the chosen path is appended to the hf:// dataset URL below.\n", "splits = {'Arabic': 'Data-v3.1/ARA-v3-1.csv', 'Chinese': 'Data-v3.1/ZHO-v3-1.csv', 'Czech': 'Data-v3.1/CES-v3-1.csv', 'Dutch': 'Data-v3.1/NLD-v3-1.csv', 'English': 'Data-v3.1/ENG-v3-1.csv', 'French': 'Data-v3.1/FRA-v3-1.csv', 'German': 'Data-v3.1/DEU-v3-1.csv', 'Greek': 'Data-v3.1/ELL-v3-1.csv', 'Hebrew': 'Data-v3.1/HEB-v3-1.csv', 'Hindi': 'Data-v3.1/HIN-v3-1.csv', 'Indonesian': 'Data-v3.1/IND-v3-1.csv', 'Italian': 'Data-v3.1/ITA-v3-1.csv', 'Japanese': 'Data-v3.1/JPN-v3-1.csv', 'Korean': 'Data-v3.1/KOR-v3-1.csv', 'Persian': 'Data-v3.1/PES-v3-1.csv', 'Polish': 'Data-v3.1/POL-v3-1.csv', 'Portuguese': 'Data-v3.1/POR-v3-1.csv', 'Romanian': 'Data-v3.1/RON-v3-1.csv', 'Russian': 'Data-v3.1/RUS-v3-1.csv', 'Spanish': 'Data-v3.1/SPA-v3-1.csv', 'Turkish': 'Data-v3.1/TUR-v3-1.csv', 'Vietnamese': 'Data-v3.1/VIE-v3-1.csv', 'Ukrainian': 'Data-v3.1/UKR-v3-1.csv'}\n", "df = pd.read_csv(\"hf://datasets/1024m/mMGTD-Corpus/\" + splits[\"Polish\"])\n", "df" ] }, { "cell_type": "code", "source": [ "# Shuffle the rows and renumber the index. The fixed seed keeps the shuffled order,\n", "# and the index-based ids derived from it in a later cell, identical on every fresh run.\n", "df = df.sample(frac=1, random_state=42).reset_index(drop=True)" ], "metadata": { "id": "KIgwx1iCpC3f" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "df_train = df[(df['Type'].isin(['Partial', 'Rewritten', 'Unchanged'])) & (df['Data Split'] 
== 'Train')]\n", "df_dev = df[(df['Type'].isin(['Partial', 'Rewritten', 'Unchanged'])) & (df['Data Split'] == 'Dev')]\n", "df_test = df[(df['Type'].isin(['Partial', 'Rewritten', 'Unchanged'])) & (df['Data Split'] == 'Test')]\n", "print(len(df_train))\n", "print(len(df_dev))\n", "print(len(df_test))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cVKBbVG9qDGF", "outputId": "8586e4e0-18d4-4db6-dc07-f82137dcf017" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "39941\n", "9986\n", "49921\n" ] } ] }, { "cell_type": "code", "source": [ "POL_train = df_train.copy()\n", "POL_dev = df_dev.copy()\n", "POL_test = df_test.copy()" ], "metadata": { "id": "1QWJPFozqFUh" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "df_train['id'] = 'POL' + df_train.index.astype(str) # Creating the 'id' column\n", "df_train = df_train.rename(columns={'Modified text': 'text', 'Split Location': 'label'})\n", "df_dev['id'] = 'POL' + df_dev.index.astype(str) # Creating the 'id' column\n", "df_dev = df_dev.rename(columns={'Modified text': 'text', 'Split Location': 'label'})\n", "df_test['id'] = 'POL' + df_test.index.astype(str) # Creating the 'id' column\n", "df_test = df_test.rename(columns={'Modified text': 'text', 'Split Location': 'label'})" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bSlHXAnzqHmd", "outputId": "8f159048-482a-4fde-e424-b878b34e9dd6" }, "execution_count": 6, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train['id'] = 'POL' + df_train.index.astype(str) # Creating the 'id' column\n", ":3: 
SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_dev['id'] = 'POL' + df_dev.index.astype(str) # Creating the 'id' column\n", ":5: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_test['id'] = 'POL' + df_test.index.astype(str) # Creating the 'id' column\n" ] } ] }, { "cell_type": "code", "source": [ "df_train = pd.concat([df_train, df_dev], ignore_index=True)" ], "metadata": { "id": "aGvboB0ZqJ8M" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "print(len(df_train))\n", "print(len(df_dev))\n", "print(len(df_test))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qIVYeup9qM5X", "outputId": "b050fc9e-dc19-43c0-a212-3f2411dccfca" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "49927\n", "9986\n", "49921\n" ] } ] }, { "cell_type": "code", "source": [ "df_train.to_json('POL_train.jsonl', orient='records', lines=True)\n", "df_test.to_json('POL_test.jsonl', orient='records', lines=True)" ], "metadata": { "id": "9javNVKDqO1j" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install torch\n", "!pip install transformers\n", "!pip install accelerate -U\n", "!pip install tqdm\n", "!pip install pytorch-crf\n", "!pip install sentencepiece" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C6wCkGRXqQpc", "outputId": "a327d946-c7e3-4a8a-9b25-2ead3febe890", "collapsed": true }, "execution_count": 
10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.5.1+cu124)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.17.0)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.5)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.10.0)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)\n", " Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)\n", " Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)\n", " Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-curand-cu12==10.3.5.147 (from torch)\n", " Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)\n", " Downloading 
nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)\n", " Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n", "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)\n", " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.0)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n", "Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m119.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m88.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading 
nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m60.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m40.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m17.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m43.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: nvidia-nvjitlink-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, 
nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12\n", " Attempting uninstall: nvidia-nvjitlink-cu12\n", " Found existing installation: nvidia-nvjitlink-cu12 12.5.82\n", " Uninstalling nvidia-nvjitlink-cu12-12.5.82:\n", " Successfully uninstalled nvidia-nvjitlink-cu12-12.5.82\n", " Attempting uninstall: nvidia-curand-cu12\n", " Found existing installation: nvidia-curand-cu12 10.3.6.82\n", " Uninstalling nvidia-curand-cu12-10.3.6.82:\n", " Successfully uninstalled nvidia-curand-cu12-10.3.6.82\n", " Attempting uninstall: nvidia-cufft-cu12\n", " Found existing installation: nvidia-cufft-cu12 11.2.3.61\n", " Uninstalling nvidia-cufft-cu12-11.2.3.61:\n", " Successfully uninstalled nvidia-cufft-cu12-11.2.3.61\n", " Attempting uninstall: nvidia-cuda-runtime-cu12\n", " Found existing installation: nvidia-cuda-runtime-cu12 12.5.82\n", " Uninstalling nvidia-cuda-runtime-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-runtime-cu12-12.5.82\n", " Attempting uninstall: nvidia-cuda-nvrtc-cu12\n", " Found existing installation: nvidia-cuda-nvrtc-cu12 12.5.82\n", " Uninstalling nvidia-cuda-nvrtc-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.5.82\n", " Attempting uninstall: nvidia-cuda-cupti-cu12\n", " Found existing installation: nvidia-cuda-cupti-cu12 12.5.82\n", " Uninstalling nvidia-cuda-cupti-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-cupti-cu12-12.5.82\n", " Attempting uninstall: nvidia-cublas-cu12\n", " Found existing installation: nvidia-cublas-cu12 12.5.3.2\n", " Uninstalling nvidia-cublas-cu12-12.5.3.2:\n", " Successfully uninstalled nvidia-cublas-cu12-12.5.3.2\n", " Attempting uninstall: nvidia-cusparse-cu12\n", " Found existing installation: nvidia-cusparse-cu12 12.5.1.3\n", " Uninstalling nvidia-cusparse-cu12-12.5.1.3:\n", " Successfully uninstalled nvidia-cusparse-cu12-12.5.1.3\n", " Attempting uninstall: 
nvidia-cudnn-cu12\n", " Found existing installation: nvidia-cudnn-cu12 9.3.0.75\n", " Uninstalling nvidia-cudnn-cu12-9.3.0.75:\n", " Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75\n", " Attempting uninstall: nvidia-cusolver-cu12\n", " Found existing installation: nvidia-cusolver-cu12 11.6.3.83\n", " Uninstalling nvidia-cusolver-cu12-11.6.3.83:\n", " Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83\n", "Successfully installed nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.48.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.17.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.28.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (24.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.0)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.2)\n", "Requirement already satisfied: 
tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (2024.10.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2025.1.31)\n", "Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (1.3.0)\n", "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.11/dist-packages (from accelerate) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (24.2)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from accelerate) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from accelerate) (6.0.2)\n", "Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (2.5.1+cu124)\n", "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.28.1)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.5.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from 
huggingface-hub>=0.21.0->accelerate) (3.17.0)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2024.10.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.1.5)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (9.1.0.70)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.5.8)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (11.2.1.3)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (10.3.5.147)\n", "Requirement already 
satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (11.6.1.9)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.3.1.170)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (2.21.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.1.0)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch>=2.0.0->accelerate) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch>=2.0.0->accelerate) (3.0.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2025.1.31)\n", "Requirement already satisfied: tqdm in 
@dataclass
class ModelConfig:
    """Where the pretrained encoder comes from and where checkpoints are written.

    The attributes are annotated so that ``@dataclass`` registers them as
    real fields: they can be overridden through the constructor and appear
    in ``repr`` / ``==``. Without annotations the decorator generates an
    ``__init__`` that accepts no arguments.
    """

    # Model identifier handed to the `from_pretrained` loaders.
    model_path: str = "hyperonym/xlm-roberta-longformer-base-16384"
    # Directory that receives the per-epoch checkpoint files.
    model_checkpoint_dir: str = "./runs"


@dataclass
class DatasetConfig:
    """Locations of the JSON-lines files used for training and prediction."""

    train_file: str = "/content/POL_train.jsonl"
    # default_factory gives every instance its own list; a plain list default
    # would be shared by all instances (and is rejected by @dataclass).
    test_files: list[str] = field(default_factory=lambda: ["/content/POL_test.jsonl"])


@dataclass
class TrainingArgsConfig:
    """Hyper-parameters and switches for the training / prediction script."""

    # Gates the training loop of the main script.
    do_train: bool = False
    do_predict: bool = False
    # Random seed passed to `transformers.set_seed`.
    seed: int = 1024
    output_dir: str = "./runs/exp_seed"
    logging_steps: int = 160
    logging_dir: str = "./runs/exp_seed"
    num_train_epochs: int = 30
    # Batch sizes given to the train / evaluation DataLoaders.
    per_device_train_batch_size: int = 12
    per_device_eval_batch_size: int = 12
    # Fixed length every encoded example is padded or truncated to.
    max_length: int = 2048


# Module-level default instances used by the rest of the notebook.
model_args = ModelConfig()
data_args = DatasetConfig()
training_args = TrainingArgsConfig()
def evaluate_position_difference(actual_position, predicted_position):
    """Return the absolute distance between the true and the predicted boundary."""
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """Locate the first 0 -> 1 transition in a sequence of 0/1 tags.

    Args:
        sequence: 1-D array-like of per-token tags (0 or 1).
        mapping: optional 1-D array-like, same length as `sequence`, giving
            the word index of every token. Positions equal to -100 are
            removed from both `sequence` and `mapping` before searching.
        token_level: when False and `mapping` is given, the token position
            is translated to the word index stored in `mapping`.

    Returns:
        The index right after the first 0 -> 1 step. If no such step exists:
        0 when the sequence starts with 1, otherwise the last index.
        Returns 0 when nothing is left after masking (the original code
        raised IndexError in that case).
    """
    # Accept plain lists as well as arrays; boolean-mask indexing needs ndarrays.
    sequence = np.asarray(sequence)
    if mapping is not None:
        mapping = np.asarray(mapping)
        mask = mapping != -100
        sequence = sequence[mask]
        mapping = mapping[mask]

    if sequence.size == 0:
        # Nothing to inspect, so there is no boundary to report.
        return 0

    change_indices = np.where(np.diff(sequence) == 1)[0]
    if len(change_indices) > 0:
        value = change_indices[0] + 1
    else:
        # No 0 -> 1 step: either every tag is 1 (boundary at 0) or every tag is 0.
        value = 0 if sequence[0] == 1 else len(sequence) - 1

    if not token_level:
        value = mapping[value] if mapping is not None else value
    return value


def evaluate_machine_start_position(labels, predictions, idx2word=None, token_level=False):
    """Mean absolute error between true and predicted boundary positions.

    Args:
        labels: 2-D array-like of gold tags, one row per example.
        predictions: 2-D array-like of predicted tags, row-aligned with `labels`.
        idx2word: per-example token -> word index mapping; required when
            `token_level` is False.
        token_level: compare token positions (True) or word positions (False).

    Returns:
        The mean of the per-example absolute position differences.

    Raises:
        ValueError: if `token_level` is False and `idx2word` is missing.
    """
    if not token_level and idx2word is None:
        raise ValueError("idx2word must be provided if evaluation is at word level (token_level=False)")

    actual_starts = []
    predicted_starts = []
    for idx in range(len(labels)):
        end = len(labels[idx])
        # Position 0 is skipped in all three sequences.
        predicted_seq = predictions[idx][1:end]
        label_seq = labels[idx][1:end]
        mapping = None if token_level else idx2word[idx][1:end]
        predicted_starts.append(get_start_position(predicted_seq, mapping, token_level))
        actual_starts.append(get_start_position(label_seq, mapping, token_level))

    position_differences = [
        evaluate_position_difference(actual, predicted)
        for actual, predicted in zip(actual_starts, predicted_starts)
    ]
    return np.mean(position_differences)


def compute_metrics(p):
    """Compute the token-level mean absolute boundary error.

    Args:
        p: a ``(predictions, labels)`` pair.

    Returns:
        ``{"mean_absolute_diff": <float>}``.
    """
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)
    return {"mean_absolute_diff": mean_absolute_diff}
class Semeval_Data(torch.utils.data.Dataset):
    """JSON-lines dataset that turns each text into fixed-length token tags.

    Every record must provide "text" and "id"; "label" is optional and is the
    index of the first word tagged 1 (words before it are tagged 0). The text
    is split on single spaces, each word is tokenized separately, and the
    result is padded or truncated to exactly `max_length` positions.
    """

    # Ids placed at the start / end / padding positions of `input_ids`.
    # NOTE(review): presumably the <s>, </s> and <pad> ids of the XLM-R
    # vocabulary — confirm against the tokenizer in use.
    BOS_ID = 0
    EOS_ID = 2
    PAD_ID = 1
    # Marker stored in `corresponding_word` for positions that map to no word.
    NO_WORD = -100

    def __init__(self, data_path, model_name, max_length=512, inference=False, debug=False):
        """Read the records and load the tokenizer.

        Args:
            data_path: path to a JSON-lines file, one record per line.
            model_name: identifier passed to `AutoTokenizer.from_pretrained`.
            max_length: length of every encoded example.
            inference: when True, items also carry "text", "id" and
                "corresponding_word".
            debug: when True (and not in inference mode), items also carry
                "partial_human_review".
        """
        # Explicit encoding keeps reading independent of the platform default;
        # blank lines (e.g. a trailing newline) are skipped instead of
        # crashing json.loads.
        with open(data_path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f if line.strip()]
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        """Number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record `idx`.

        Returns:
            dict with tensors "input_ids" and "attention_mask" (plus "labels"
            when the record has a "label"), each of length `max_length`.
            In inference mode "text", "id" and the plain-list
            "corresponding_word" are added; in debug mode (without
            inference) "partial_human_review" holds the words before the
            label position.
        """
        record = self.data[idx]
        text = record["text"]
        sample_id = record["id"]
        labels_available = "label" in record
        label = record["label"] if labels_available else None
        words = text.split(" ")

        labels = []
        corresponding_word = []
        input_ids = []
        for jdx, word in enumerate(words):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)
            if labels_available:
                # Every sub-token inherits the tag of the word it belongs to.
                is_machine_text = 1 if jdx >= label else 0
                labels.extend([is_machine_text] * sub_words)
            corresponding_word.extend([jdx] * sub_words)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))

        budget = self.max_length - 2  # room left after the two special tokens
        n_tokens = len(input_ids)
        if n_tokens < budget:
            pad = budget - n_tokens
            if labels_available:
                # Padding repeats the last real tag; fall back to 0 when the
                # text produced no tokens at all (previously an IndexError).
                fill = labels[-1] if labels else 0
                labels = [0] + labels + [fill] * (pad + 1)
            attention_mask = [1] * (n_tokens + 2) + [0] * pad
            corresponding_word = [self.NO_WORD] + corresponding_word + [self.NO_WORD] * (pad + 1)
            input_ids = [self.BOS_ID] + input_ids + [self.EOS_ID] + [self.PAD_ID] * pad
        else:
            if labels_available:
                closing = labels[self.max_length - 3] if labels else 0
                labels = [0] + labels[:budget] + [closing]
            attention_mask = [1] * (len(input_ids[:budget]) + 2)
            corresponding_word = [self.NO_WORD] + corresponding_word[:budget] + [self.NO_WORD]
            input_ids = [self.BOS_ID] + input_ids[:budget] + [self.EOS_ID]

        encoded = {}
        if labels_available:
            encoded["labels"] = torch.tensor(labels)
        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)
        if labels_available:
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            # int() because a label read from JSON may be a float (e.g. 12.0),
            # which cannot be used as a slice bound.
            encoded["partial_human_review"] = " ".join(words[:int(label)])

        if self.inference:
            encoded["text"] = text
            encoded["id"] = sample_id
            # Deliberately left as a Python list, not a tensor.
            encoded["corresponding_word"] = corresponding_word
        return encoded
"c202f602bd1e405d9837a1bc2bce5d42", "69d7a592b912421db9eb32a41e141069", "28a64c5b804440cb886ea0fb53aa1877", "2a8f2e40a6df420f9ff1e1150054efcd", "8b13fcfdf7244095bcb8d95604105463", "31bb547c74d14810b98cd11222ab0b4e", "22c8ceff4cad4d2f9064b31fa8a119ac", "dc3be8d1937247c5b909f862b8fd6b1c", "a76f964b417b462f9faa50f3c672eb9c", "a485a5ea4801495cb8498f0cfbfd8f11", "a35ae39aca914af083536512e91be51c", "c83c9142abeb4db5bc6eeb1d893b69bc", "0fce3a68b4a04f4698b7c378f2431a58", "b68b93a049834ff881b4e85396c47129", "d829bd4541654278ad01bbea3462b2e0", "c2a4044283c24b0285bf32eb71d74824", "b69bfcb2a3de4104b02bb26cc1892613", "f0571fa7c0dc4502b6057eb6c1e0f1a1", "5c299c4c55124c3780b896428d2d36b2", "0c757cfe5eaf46b5a1e0f2ad8af2ad65", "0e3740b1cb904ddfad4b659e979c1768", "87540dfec4fe4d3ba60dd14050bcac90", "b47685ec5c8147eba4ea353344da87ca", "7694847c4d0e4defa9485f4bbbcc8b2b", "7bd649b1deaf48a9aa2a0dde5d3d92ab", "90d7a474658d4822ad7e2d23b52edf42", "c137c41aedba44578b1734f91c5da313", "cc09e9bb1c984cd49437736f8c6e6ba0", "8cae6715b19e4c46a3de982950045f31", "b3ad4244bd3d4def916013b9cdd0e529", "36795ecb440e44a294472dc8b81d0121", "c16066fd1c584b7aa9c11cf087deddaa", "7f3c219cb89b49e4ad653aafc9a9fd60", "7be5a2a5bfee4719bac6649842744cd7", "ae0853b3f4054cf08494eed2da4059f8", "d7cf7850b21745df946a44b36e227553", "867a96191cbe48d7a22be575d748f04e", "31157ef17afc4da293f61c7440bdf73f", "136be58a40e1430283ddbf04594b6433", "21e8778f26404971b06b174aeb46e0a4", "16cb57dfeda04dc693ce196fc5a2f9b2", "e0ecf0c5ea2b44e0b0331acee5172588", "0a4528fea6cf4be6af58cba8f55ae0a6", "6022886bee0945bbb79e1135495850b4", "95c32e816fc14c089164377e3dd1eac8", "0e80843dafc3438fb614eeb01be4650f", "e245e508678f46b99126ea7bff49bfbe", "0f94be2ff070471d9d5dc71661b3016d", "6a355e8ef31a4302a26b97024a7ac346", "c7e7ae5f09d043cebb2ed3ba566f6223" ] }, "id": "tXBLrJp0quLE", "outputId": "4d5a1076-9514-47a0-9864-82d1ddd59bed" }, "execution_count": 12, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "config.json: 0%| | 
0.00/772 [00:00] 3.32G 18.9MB/s in 3m 6s \n", "\n", "2025-02-12 00:49:23 (18.3 MB/s) - ‘POL-xlm-longformer’ saved [3563459222/3563459222]\n", "\n" ] } ] }, { "cell_type": "code", "source": [
"import os\n",
"\n",
"# Rebuild the model and restore the weights stored in the checkpoint file.\n",
"model = AutoModelCRF(model_args.model_path).to(device)\n",
"tokenizer = AutoTokenizer.from_pretrained(model_args.model_path)\n",
"# NOTE(review): torch.load unpickles the file, which can execute arbitrary code -- only load\n",
"# checkpoints you trust. map_location remaps the stored tensors onto `device`, so a checkpoint\n",
"# saved on GPU also loads on a CPU-only runtime.\n",
"checkpoint = torch.load('POL-xlm-longformer', map_location=device)\n",
"model.load_state_dict(checkpoint['model_state_dict'])\n",
"model.eval()\n",
"\n",
"# One evaluation DataLoader per test file, in the order of data_args.test_files.\n",
"test_sets = []\n",
"for test_file in data_args.test_files:\n",
"    test_set = Semeval_Data(test_file, model_args.model_path, max_length=training_args.max_length, inference=True)\n",
"    test_dataloader = DataLoader(test_set, batch_size=training_args.per_device_eval_batch_size, shuffle=False)\n",
"    test_sets.append(test_dataloader)\n",
"\n",
"logger.info(\"Predicting...\")\n",
"logger.info(\"*** Test Datasets ***\")\n",
"logger.info(f\"Number of sets: {len(test_sets)}\")\n",
"# Iterate files and loaders together: the output name must come from the file this loader was\n",
"# built from. Reading the leftover `test_file` of the loop above would always give the last\n",
"# file, so with several test files every prediction set would overwrite the same output.\n",
"for idx, (test_file, test_set) in enumerate(zip(data_args.test_files, test_sets)):\n",
"    logger.info(f\"Test Dataset {idx + 1}\")\n",
"    # len() of a DataLoader counts batches; the sample count is the length of its dataset.\n",
"    logger.info(f\"Number of samples: {len(test_set.dataset)}\")\n",
"    predictions = predict(model, test_set, device)\n",
"    corresponding_words = []\n",
"    ids = []\n",
"    # Walk the loader to collect the ids and `corresponding_word` values of every batch.\n",
"    for batch in test_set:\n",
"        corr_word_tensors = [torch.tensor(cw) for cw in batch['corresponding_word']]\n",
"        corr_word_padded = torch.nn.utils.rnn.pad_sequence(corr_word_tensors, batch_first=True, padding_value=-100)\n",
"        # Swap the two axes -- presumably so that each row belongs to one sample; TODO confirm.\n",
"        corr_word = np.transpose(corr_word_padded.numpy(), (1, 0))\n",
"        ids.extend(batch[\"id\"])\n",
"        corresponding_words.extend(corr_word)\n",
"    corresponding_words = np.array(corresponding_words)\n",
"    logger.info(\"Predictions completed!\")\n",
"    df_ids = []\n",
"    df_labels = []\n",
"    # `sample_id` instead of `id`, so the builtin id() is not shadowed for the rest of the notebook.\n",
"    for sample_id, pred, corr_word in zip(ids, predictions, corresponding_words):\n",
"        df_ids.append(sample_id)\n",
"        df_labels.append(get_start_position(pred, corr_word, token_level=False))\n",
"    df = pd.DataFrame({\"id\": df_ids, \"label\": df_labels})\n",
"    # Write one JSON object per line to <output_dir>/predictions/<name of the test file>.\n",
"    file_name = os.path.basename(test_file)\n",
"    file_dirs = os.path.join(training_args.output_dir, \"predictions\")\n",
"    os.makedirs(file_dirs, exist_ok=True)\n",
"    file_path = os.path.join(file_dirs, file_name)\n",
"    records = df.to_dict(\"records\")\n",
"    with open(file_path, \"w\") as f:\n",
"        for record in records:\n",
"            f.write(json.dumps(record) + \"\\n\")"
], "metadata": { "id": "fIMLFzDxrVSA", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f05a7e62-abc0-4ca2-9f2a-2fcf2682487a" }, "execution_count": 14, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "All TF 2.0 model weights were used when initializing LongformerModel.\n", "\n", "All the weights of LongformerModel were initialized from the TF 2.0 model.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use LongformerModel for predictions without further training.\n", ":4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", " checkpoint = torch.load('POL-xlm-longformer')\n", " 0%| | 0/4161 [00:00:22: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " corr_word_tensors = [torch.tensor(cw) for cw in batch['corresponding_word']]\n" ] } ] }, { "cell_type": "code", "source": [
"# %pip installs into the environment of the running kernel; the version is pinned so a re-run\n",
"# gets the same package that produced the recorded output.\n",
"%pip install jsonlines==4.0.0\n",
"import pandas as pd\n",
"import jsonlines\n",
"\n",
"# Predictions written by the inference cell for the POL test file.\n",
"jsonl_file_path = '/content/runs/exp_seed/predictions/POL_test.jsonl'\n",
"\n",
"\n",
"def display_jsonl_as_dataframe(file_path):\n",
"    \"\"\"Read a JSON Lines file and return its records as a pandas DataFrame.\n",
"\n",
"    Args:\n",
"        file_path: path of the .jsonl file; every line is parsed as one record.\n",
"\n",
"    Returns:\n",
"        A DataFrame with one row per record (empty if the file has no records).\n",
"    \"\"\"\n",
"    with jsonlines.open(file_path) as reader:\n",
"        # The reader yields one parsed object per line; materialise them in file order.\n",
"        return pd.DataFrame(list(reader))\n",
"\n",
"\n",
"jsonl_df = display_jsonl_as_dataframe(jsonl_file_path)\n",
"jsonl_df"
], "metadata": { "id": "yutpCG-Drcjn", "colab": { "base_uri": "https://localhost:8080/", "height": 527 }, "outputId": "caf8d2eb-0c08-461a-aeeb-25b708a48144" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting jsonlines\n", " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.11/dist-packages (from jsonlines) (25.1.0)\n", "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", "Installing collected packages: jsonlines\n", "Successfully installed jsonlines-4.0.0\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " id label\n", "0 POL0 17\n", "1 POL1 26\n", "2 POL3 24\n", "3 POL4 37\n", "4 POL8 25\n", "... ... ...\n", "49916 POL99838 23\n", "49917 POL99839 36\n", "49918 POL99841 12\n", "49919 POL99845 28\n", "49920 POL99846 19\n", "\n", "[49921 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel
0POL017
1POL126
2POL324
3POL437
4POL825
.........
49916POL9983823
49917POL9983936
49918POL9984112
49919POL9984528
49920POL9984619
\n", "

49921 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "jsonl_df", "summary": "{\n \"name\": \"jsonl_df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"POL45784\",\n \"POL26040\",\n \"POL86331\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138,\n 207,\n 297\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [
"# Test file that carries the gold `label` column. display_jsonl_as_dataframe is the loader\n",
"# defined in the cell that reads the predictions; it is reused here rather than re-defined,\n",
"# so the notebook keeps a single definition of the function.\n",
"jsonl_file_path = '/content/POL_test.jsonl'\n",
"jsonl_df_gold = display_jsonl_as_dataframe(jsonl_file_path)\n",
"jsonl_df_gold"
], "metadata": { "id": "nLm2KGliriEN", "colab": { "base_uri": "https://localhost:8080/", "height": 614 }, "outputId": "5da47c69-7467-4ffa-ada6-be1d55259ae8" }, "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POL GPT-o1 Partial Test \n", "1 POL GPT-o1 Partial Test \n", "2 POL Gemini-Flash-1.5 Partial Test \n", "3 POL GPT-o1 Partial Test \n", "4 POL Claude-Sonnet-3.5 Partial Test \n", "... ... ... ... ... \n", "49916 POL Amazon-Nova-Pro-1.0 Partial Test \n", "49917 POL Claude-Haiku-3.5 Partial Test \n", "49918 POL Aya-23 Partial Test \n", "49919 POL Aya-23 Partial Test \n", "49920 POL Claude-Haiku-3.5 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Godne polecenia (choć paliwożerne) są turbodoł... 41 \n", "1 Natalia Partyka też przegrała 3:4, tyle że z K... 
52 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 43 \n", "3 Opiekowała się rodzinami oficerów, którzy zgin... 49 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 46 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 43 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 63 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 33 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 56 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 32 \n", "\n", " Original Char Count label \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " text New Word Count \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 \n", "\n", " New Char Count id \n", "0 705 POL0 \n", "1 618 POL1 \n", "2 404 POL3 \n", "3 1409 POL4 \n", "4 566 POL8 \n", "... ... ... \n", "49916 707 POL99838 \n", "49917 660 POL99839 \n", "49918 1033 POL99841 \n", "49919 850 POL99845 \n", "49920 478 POL99846 \n", "\n", "[49921 rows x 12 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountlabeltextNew Word CountNew Char Countid
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95705POL0
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95618POL1
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58404POL3
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...2111409POL4
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76566POL8
.......................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...91707POL99838
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...94660POL99839
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...1431033POL99841
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...109850POL99845
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77478POL99846
\n", "

49921 rows × 12 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "jsonl_df_gold", "summary": "{\n \"name\": \"jsonl_df_gold\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Partial\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na ulicy w towarzystwie jakich\\u015b os\\u00f3b. 
To mnie chroni przed lud\\u017ami, kt\\u00f3rzy nieustannie chc\\u0105 sobie ze mn\\u0105 robi\\u0107 zdj\\u0119cia lub prosz\\u0105 o autograf' - doda\\u0142.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30,\n \"min\": 4,\n \"max\": 812,\n \"num_unique_values\": 318,\n \"samples\": [\n 130\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 207,\n \"min\": 9,\n \"max\": 5216,\n \"num_unique_values\": 1437,\n \"samples\": [\n 172\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 36,\n \"min\": 1,\n \"max\": 2417,\n \"num_unique_values\": 329,\n \"samples\": [\n 67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 250,\n \"min\": 6,\n \"max\": 13840,\n \"num_unique_values\": 1690,\n \"samples\": [\n 934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"POL45784\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [
"# Give the two label columns distinct names before joining predictions with the gold data.\n",
"jsonl_df = jsonl_df.rename(columns={'label': 'label_pred'})\n",
"jsonl_df_gold = jsonl_df_gold.rename(columns={'label': 'label_gold'})\n",
"# Inner join on `id` (the pandas default, spelled out here). validate='one_to_one' makes\n",
"# pandas raise a MergeError if an id is duplicated on either side, instead of silently\n",
"# multiplying rows in the merged frame.\n",
"merged_df = pd.merge(\n",
"    jsonl_df[['id', 'label_pred']],\n",
"    jsonl_df_gold[['id', 'text', 'label_gold']],\n",
"    on='id',\n",
"    how='inner',\n",
"    validate='one_to_one',\n",
")\n",
"merged_df"
], "metadata": { "id": "wFmwSZsirsFY", "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "outputId": "6e83aca2-d62e-4a6f-de1d-80f9ea7ba7ad" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id label_pred \\\n", "0 POL0 17 \n", "1 POL1 26 \n", "2 POL3 24 \n", "3 POL4 37 \n", "4 POL8 25 \n", "... ... ... \n", "49916 POL99838 23 \n", "49917 POL99839 36 \n", "49918 POL99841 12 \n", "49919 POL99845 28 \n", "49920 POL99846 19 \n", "\n", " text label_gold \n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 17 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 
30 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 24 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 37 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 25 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 26 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 36 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 17 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 28 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 19 \n", "\n", "[49921 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel_predtextlabel_gold
0POL017Godne polecenia ( choć paliwożerne ) są turbod...17
1POL126Natalia Partyka też przegrała 3:4 , tyle że z ...30
2POL324Niemcy chcieli wybudować trzy takie wjazdy, uk...24
3POL437Opiekowała się rodzinami oficerów , którzy zgi...37
4POL825W minionym tygodniu gminne dożynki odbyły się ...25
...............
49916POL9983823Efektownymi porównaniami rzucali też szefowie ...26
49917POL9983936Film - który wywołał oburzenie krytyków w Cann...36
49918POL9984112Pierwszy trailer gry Dead Island zrobił wielką...17
49919POL9984528O ujawnienie jego treści od dawna walczą organ...28
49920POL998461975 min. - na boisku walka w środku pola, a żół...19
\n", "

49921 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"POL45784\",\n \"POL26040\",\n \"POL86331\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138,\n 207,\n 297\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\",\n \"Sporej liczbie Amerykan\\u00f3w to si\\u0119 znudzi\\u0142o. Eksperci zgadzaj\\u0105 si\\u0119, \\u017ce b\\u0142\\u0119dem by\\u0142a monumentalna podr\\u00f3\\u017c Obamy po Bliskim Wschodzie. Nie przynios\\u0142a ona \\u017cadnych wymiernych korzy\\u015bci, a jedynie pog\\u0142\\u0119bi\\u0142a istniej\\u0105ce napi\\u0119cia. Wiele g\\u0142os\\u00f3w krytyki podnosi si\\u0119 r\\u00f3wnie\\u017c w kwestii polityki wewn\\u0119trznej. 
Rosn\\u0105ce bezrobocie i stagnacja gospodarcza s\\u0105 powa\\u017cnym zagro\\u017ceniem dla stabilno\\u015bci kraju. Amerykanie oczekuj\\u0105 konkretnych dzia\\u0142a\\u0144, a nie pustych obietnic. Czy obecna administracja b\\u0119dzie w stanie sprosta\\u0107 tym wyzwaniom? Czas poka\\u017ce.\\n\",\n \"B., kt\\u00f3ra w banku mia\\u0142a dost\\u0119p niemal do wszystkiego, nie przypadkiem wybra\\u0142a konto dwojga suwalskich emeryt\\u00f3w. Bo przez kilka lat wp\\u0142ywa\\u0142y tam pieni\\u0105dze, a nikt nie pobra\\u0142 ani z\\u0142ot\\u00f3wki. Co mog\\u0142a pomy\\u015ble\\u0107 nieuczciwa pracownica? \\u017be ludzie zmarli, Here's a continuation in Polish:\\n\\n\\u017ce nikt nie b\\u0119dzie sprawdza\\u0142 tego konta, i \\u017ce mo\\u017ce je spokojnie opr\\u00f3\\u017cni\\u0107. Wiedzia\\u0142a, \\u017ce ryzyko wpadki jest minimalne. Przez d\\u0142ugi czas wydawa\\u0142o jej si\\u0119, \\u017ce jej plan jest doskona\\u0142y. Nikt nie mia\\u0142 poj\\u0119cia o jej machinacjach, a pieni\\u0105dze systematycznie znika\\u0142y z konta emeryt\\u00f3w.\\n\\nJednak los lubi p\\u0142ata\\u0107 figle. Pewnego dnia, zupe\\u0142nie nieoczekiwanie, sprawa si\\u0119 wyda\\u0142a. 
Jeden z krewnych emeryt\\u00f3w postan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64,\n 161,\n 219\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [
"# Absolute gap between the predicted and the gold label, stored per row in `diff`.\n",
"merged_df['diff'] = merged_df['label_pred'].sub(merged_df['label_gold']).abs()\n",
"merged_df"
], "metadata": { "id": "Lh8HQBtIrvFx", "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "outputId": "a2997e84-119e-46a4-d7b0-1dcd3dfb3dcd" }, "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id label_pred \\\n", "0 POL0 17 \n", "1 POL1 26 \n", "2 POL3 24 \n", "3 POL4 37 \n", "4 POL8 25 \n", "... ... ... \n", "49916 POL99838 23 \n", "49917 POL99839 36 \n", "49918 POL99841 12 \n", "49919 POL99845 28 \n", "49920 POL99846 19 \n", "\n", " text label_gold diff \n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 17 0 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 30 4 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 24 0 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 37 0 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 25 0 \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 26 3 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 36 0 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 17 5 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 28 0 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 19 0 \n", "\n", "[49921 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel_predtextlabel_golddiff
0POL017Godne polecenia ( choć paliwożerne ) są turbod...170
1POL126Natalia Partyka też przegrała 3:4 , tyle że z ...304
2POL324Niemcy chcieli wybudować trzy takie wjazdy, uk...240
3POL437Opiekowała się rodzinami oficerów , którzy zgi...370
4POL825W minionym tygodniu gminne dożynki odbyły się ...250
..................
49916POL9983823Efektownymi porównaniami rzucali też szefowie ...263
49917POL9983936Film - który wywołał oburzenie krytyków w Cann...360
49918POL9984112Pierwszy trailer gry Dead Island zrobił wielką...175
49919POL9984528O ujawnienie jego treści od dawna walczą organ...280
49920POL998461975 min. - na boisku walka w środku pola, a żół...190
\n", "

49921 rows × 5 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"POL45784\",\n \"POL26040\",\n \"POL86331\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138,\n 207,\n 297\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\",\n \"Sporej liczbie Amerykan\\u00f3w to si\\u0119 znudzi\\u0142o. Eksperci zgadzaj\\u0105 si\\u0119, \\u017ce b\\u0142\\u0119dem by\\u0142a monumentalna podr\\u00f3\\u017c Obamy po Bliskim Wschodzie. Nie przynios\\u0142a ona \\u017cadnych wymiernych korzy\\u015bci, a jedynie pog\\u0142\\u0119bi\\u0142a istniej\\u0105ce napi\\u0119cia. Wiele g\\u0142os\\u00f3w krytyki podnosi si\\u0119 r\\u00f3wnie\\u017c w kwestii polityki wewn\\u0119trznej. 
Rosn\\u0105ce bezrobocie i stagnacja gospodarcza s\\u0105 powa\\u017cnym zagro\\u017ceniem dla stabilno\\u015bci kraju. Amerykanie oczekuj\\u0105 konkretnych dzia\\u0142a\\u0144, a nie pustych obietnic. Czy obecna administracja b\\u0119dzie w stanie sprosta\\u0107 tym wyzwaniom? Czas poka\\u017ce.\\n\",\n \"B., kt\\u00f3ra w banku mia\\u0142a dost\\u0119p niemal do wszystkiego, nie przypadkiem wybra\\u0142a konto dwojga suwalskich emeryt\\u00f3w. Bo przez kilka lat wp\\u0142ywa\\u0142y tam pieni\\u0105dze, a nikt nie pobra\\u0142 ani z\\u0142ot\\u00f3wki. Co mog\\u0142a pomy\\u015ble\\u0107 nieuczciwa pracownica? \\u017be ludzie zmarli, Here's a continuation in Polish:\\n\\n\\u017ce nikt nie b\\u0119dzie sprawdza\\u0142 tego konta, i \\u017ce mo\\u017ce je spokojnie opr\\u00f3\\u017cni\\u0107. Wiedzia\\u0142a, \\u017ce ryzyko wpadki jest minimalne. Przez d\\u0142ugi czas wydawa\\u0142o jej si\\u0119, \\u017ce jej plan jest doskona\\u0142y. Nikt nie mia\\u0142 poj\\u0119cia o jej machinacjach, a pieni\\u0105dze systematycznie znika\\u0142y z konta emeryt\\u00f3w.\\n\\nJednak los lubi p\\u0142ata\\u0107 figle. Pewnego dnia, zupe\\u0142nie nieoczekiwanie, sprawa si\\u0119 wyda\\u0142a. 
Jeden z krewnych emeryt\\u00f3w postan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64,\n 161,\n 219\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 370,\n \"num_unique_values\": 193,\n \"samples\": [\n 61,\n 121,\n 92\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [
"# Turn ids such as 'POL123' into the integer 123 by dropping the leading non-digit prefix.\n",
"# Casting to str first keeps the cell re-runnable: slicing with `.str[3:]` fails on a second\n",
"# run because `.str` only works on string values and the column is already int by then.\n",
"# The pattern also removes the fixed 3-character assumption about the prefix length.\n",
"merged_df['id'] = merged_df['id'].astype(str).str.replace(r'^\\D+', '', regex=True).astype(int)\n",
"merged_df"
], "metadata": { "id": "zZf3ctI2rwvS", "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "outputId": "14bd818a-f8fe-4d49-a09b-11336ce3fa6f" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id label_pred text \\\n", "0 0 17 Godne polecenia ( choć paliwożerne ) są turbod... \n", "1 1 26 Natalia Partyka też przegrała 3:4 , tyle że z ... \n", "2 3 24 Niemcy chcieli wybudować trzy takie wjazdy, uk... \n", "3 4 37 Opiekowała się rodzinami oficerów , którzy zgi... \n", "4 8 25 W minionym tygodniu gminne dożynki odbyły się ... \n", "... ... ... ... \n", "49916 99838 23 Efektownymi porównaniami rzucali też szefowie ... \n", "49917 99839 36 Film - który wywołał oburzenie krytyków w Cann... \n", "49918 99841 12 Pierwszy trailer gry Dead Island zrobił wielką... \n", "49919 99845 28 O ujawnienie jego treści od dawna walczą organ... \n", "49920 99846 19 75 min. - na boisku walka w środku pola, a żół... \n", "\n", " label_gold diff \n", "0 17 0 \n", "1 30 4 \n", "2 24 0 \n", "3 37 0 \n", "4 25 0 \n", "... ... ... 
\n", "49916 26 3 \n", "49917 36 0 \n", "49918 17 5 \n", "49919 28 0 \n", "49920 19 0 \n", "\n", "[49921 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel_predtextlabel_golddiff
0017Godne polecenia ( choć paliwożerne ) są turbod...170
1126Natalia Partyka też przegrała 3:4 , tyle że z ...304
2324Niemcy chcieli wybudować trzy takie wjazdy, uk...240
3437Opiekowała się rodzinami oficerów , którzy zgi...370
4825W minionym tygodniu gminne dożynki odbyły się ...250
..................
499169983823Efektownymi porównaniami rzucali też szefowie ...263
499179983936Film - który wywołał oburzenie krytyków w Cann...360
499189984112Pierwszy trailer gry Dead Island zrobił wielką...175
499199984528O ujawnienie jego treści od dawna walczą organ...280
49920998461975 min. - na boisku walka w środku pola, a żół...190
\n", "

49921 rows × 5 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28766,\n \"min\": 0,\n \"max\": 99846,\n \"num_unique_values\": 49921,\n \"samples\": [\n 45784,\n 26040,\n 86331\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138,\n 207,\n 297\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\",\n \"Sporej liczbie Amerykan\\u00f3w to si\\u0119 znudzi\\u0142o. Eksperci zgadzaj\\u0105 si\\u0119, \\u017ce b\\u0142\\u0119dem by\\u0142a monumentalna podr\\u00f3\\u017c Obamy po Bliskim Wschodzie. Nie przynios\\u0142a ona \\u017cadnych wymiernych korzy\\u015bci, a jedynie pog\\u0142\\u0119bi\\u0142a istniej\\u0105ce napi\\u0119cia. 
Wiele g\\u0142os\\u00f3w krytyki podnosi si\\u0119 r\\u00f3wnie\\u017c w kwestii polityki wewn\\u0119trznej. Rosn\\u0105ce bezrobocie i stagnacja gospodarcza s\\u0105 powa\\u017cnym zagro\\u017ceniem dla stabilno\\u015bci kraju. Amerykanie oczekuj\\u0105 konkretnych dzia\\u0142a\\u0144, a nie pustych obietnic. Czy obecna administracja b\\u0119dzie w stanie sprosta\\u0107 tym wyzwaniom? Czas poka\\u017ce.\\n\",\n \"B., kt\\u00f3ra w banku mia\\u0142a dost\\u0119p niemal do wszystkiego, nie przypadkiem wybra\\u0142a konto dwojga suwalskich emeryt\\u00f3w. Bo przez kilka lat wp\\u0142ywa\\u0142y tam pieni\\u0105dze, a nikt nie pobra\\u0142 ani z\\u0142ot\\u00f3wki. Co mog\\u0142a pomy\\u015ble\\u0107 nieuczciwa pracownica? \\u017be ludzie zmarli, Here's a continuation in Polish:\\n\\n\\u017ce nikt nie b\\u0119dzie sprawdza\\u0142 tego konta, i \\u017ce mo\\u017ce je spokojnie opr\\u00f3\\u017cni\\u0107. Wiedzia\\u0142a, \\u017ce ryzyko wpadki jest minimalne. Przez d\\u0142ugi czas wydawa\\u0142o jej si\\u0119, \\u017ce jej plan jest doskona\\u0142y. Nikt nie mia\\u0142 poj\\u0119cia o jej machinacjach, a pieni\\u0105dze systematycznie znika\\u0142y z konta emeryt\\u00f3w.\\n\\nJednak los lubi p\\u0142ata\\u0107 figle. Pewnego dnia, zupe\\u0142nie nieoczekiwanie, sprawa si\\u0119 wyda\\u0142a. 
Jeden z krewnych emeryt\\u00f3w postan\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64,\n 161,\n 219\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 370,\n \"num_unique_values\": 193,\n \"samples\": [\n 61,\n 121,\n 92\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "merged_df = POL_test.merge(merged_df, left_index=True, right_on='id', how='outer')\n", "merged_df" ], "metadata": { "id": "yzQw_jhDr1-E", "colab": { "base_uri": "https://localhost:8080/", "height": 822 }, "outputId": "1f03b716-06de-4d4b-ae76-4c0935491265" }, "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POL GPT-o1 Partial Test \n", "1 POL GPT-o1 Partial Test \n", "2 POL Gemini-Flash-1.5 Partial Test \n", "3 POL GPT-o1 Partial Test \n", "4 POL Claude-Sonnet-3.5 Partial Test \n", "... ... ... ... ... \n", "49916 POL Amazon-Nova-Pro-1.0 Partial Test \n", "49917 POL Claude-Haiku-3.5 Partial Test \n", "49918 POL Aya-23 Partial Test \n", "49919 POL Aya-23 Partial Test \n", "49920 POL Claude-Haiku-3.5 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Godne polecenia (choć paliwożerne) są turbodoł... 41 \n", "1 Natalia Partyka też przegrała 3:4, tyle że z K... 52 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 43 \n", "3 Opiekowała się rodzinami oficerów, którzy zgin... 49 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 46 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 
43 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 63 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 33 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 56 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 32 \n", "\n", " Original Char Count Split Location \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " Modified text New Word Count \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 \n", "\n", " New Char Count id label_pred \\\n", "0 705 0 17 \n", "1 618 1 26 \n", "2 404 3 24 \n", "3 1409 4 37 \n", "4 566 8 25 \n", "... ... ... ... \n", "49916 707 99838 23 \n", "49917 660 99839 36 \n", "49918 1033 99841 12 \n", "49919 850 99845 28 \n", "49920 478 99846 19 \n", "\n", " text label_gold diff \n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 17 0 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 30 4 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 24 0 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 37 0 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 25 0 \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 26 3 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 
36 0 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 17 5 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 28 0 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 19 0 \n", "\n", "[49921 rows x 16 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiff
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95705017Godne polecenia ( choć paliwożerne ) są turbod...170
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95618126Natalia Partyka też przegrała 3:4 , tyle że z ...304
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58404324Niemcy chcieli wybudować trzy takie wjazdy, uk...240
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...2111409437Opiekowała się rodzinami oficerów , którzy zgi...370
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76566825W minionym tygodniu gminne dożynki odbyły się ...250
...................................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...917079983823Efektownymi porównaniami rzucali też szefowie ...263
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...946609983936Film - który wywołał oburzenie krytyków w Cann...360
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...14310339984112Pierwszy trailer gry Dead Island zrobił wielką...175
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...1098509984528O ujawnienie jego treści od dawna walczą organ...280
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77478998461975 min. - na boisku walka w środku pola, a żół...190
\n", "

49921 rows × 16 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Partial\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na ulicy w towarzystwie jakich\\u015b os\\u00f3b. 
To mnie chroni przed lud\\u017ami, kt\\u00f3rzy nieustannie chc\\u0105 sobie ze mn\\u0105 robi\\u0107 zdj\\u0119cia lub prosz\\u0105 o autograf' - doda\\u0142.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30,\n \"min\": 4,\n \"max\": 812,\n \"num_unique_values\": 318,\n \"samples\": [\n 130\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 207,\n \"min\": 9,\n \"max\": 5216,\n \"num_unique_values\": 1437,\n \"samples\": [\n 172\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 36,\n \"min\": 1,\n \"max\": 2417,\n \"num_unique_values\": 329,\n \"samples\": [\n 67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 250,\n \"min\": 6,\n \"max\": 13840,\n \"num_unique_values\": 1690,\n \"samples\": [\n 934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28766,\n \"min\": 0,\n \"max\": 99846,\n \"num_unique_values\": 49921,\n \"samples\": [\n 45784\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 370,\n \"num_unique_values\": 193,\n \"samples\": [\n 61\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 20 } ] }, { "cell_type": "code", "source": [ "df = merged_df.copy()\n", "tokenizer = AutoTokenizer.from_pretrained(\"hyperonym/xlm-roberta-longformer-base-16384\") # USE SAME TOKENIZER AS USED IN TRAINING\n", "def check_split_position(row):\n", " text = row['Modified text']\n", " words = text.split()\n", " cumulative_tokens = 0\n", " for i in range(row['Split Location']): # Assuming Split Location is 1-based index\n", " tokens = tokenizer.tokenize(words[i])\n", " cumulative_tokens += len(tokens)\n", " if cumulative_tokens > 2048: # Check if we've already passed 2048 tokens\n", " return \"Outside\"\n", " return \"Inside\"\n", "df['Token Limit Check'] = df.apply(check_split_position, axis=1)\n", "df" ], "metadata": { "id": "n3EtjBRXr53j", "colab": { "base_uri": "https://localhost:8080/", "height": 909 }, "outputId": "27ceb0f4-c757-4183-ee7a-01f946b788dd" }, "execution_count": 21, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POL GPT-o1 Partial Test \n", "1 POL GPT-o1 Partial Test \n", "2 POL Gemini-Flash-1.5 Partial Test \n", "3 POL GPT-o1 Partial Test \n", "4 POL Claude-Sonnet-3.5 Partial Test \n", "... ... ... ... ... 
\n", "49916 POL Amazon-Nova-Pro-1.0 Partial Test \n", "49917 POL Claude-Haiku-3.5 Partial Test \n", "49918 POL Aya-23 Partial Test \n", "49919 POL Aya-23 Partial Test \n", "49920 POL Claude-Haiku-3.5 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Godne polecenia (choć paliwożerne) są turbodoł... 41 \n", "1 Natalia Partyka też przegrała 3:4, tyle że z K... 52 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 43 \n", "3 Opiekowała się rodzinami oficerów, którzy zgin... 49 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 46 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 43 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 63 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 33 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 56 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 32 \n", "\n", " Original Char Count Split Location \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " Modified text New Word Count \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 \n", "\n", " New Char Count id label_pred \\\n", "0 705 0 17 \n", "1 618 1 26 \n", "2 404 3 24 \n", "3 1409 4 37 \n", "4 566 8 25 \n", "... ... ... ... 
\n", "49916 707 99838 23 \n", "49917 660 99839 36 \n", "49918 1033 99841 12 \n", "49919 850 99845 28 \n", "49920 478 99846 19 \n", "\n", " text label_gold diff \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 17 0 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 30 4 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 24 0 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 37 0 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 25 0 \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 26 3 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 36 0 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 17 5 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 28 0 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 19 0 \n", "\n", " Token Limit Check \n", "0 Inside \n", "1 Inside \n", "2 Inside \n", "3 Inside \n", "4 Inside \n", "... ... \n", "49916 Inside \n", "49917 Inside \n", "49918 Inside \n", "49919 Inside \n", "49920 Inside \n", "\n", "[49921 rows x 17 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiffToken Limit Check
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95705017Godne polecenia ( choć paliwożerne ) są turbod...170Inside
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95618126Natalia Partyka też przegrała 3:4 , tyle że z ...304Inside
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58404324Niemcy chcieli wybudować trzy takie wjazdy, uk...240Inside
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...2111409437Opiekowała się rodzinami oficerów , którzy zgi...370Inside
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76566825W minionym tygodniu gminne dożynki odbyły się ...250Inside
......................................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...917079983823Efektownymi porównaniami rzucali też szefowie ...263Inside
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...946609983936Film - który wywołał oburzenie krytyków w Cann...360Inside
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...14310339984112Pierwszy trailer gry Dead Island zrobił wielką...175Inside
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...1098509984528O ujawnienie jego treści od dawna walczą organ...280Inside
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77478998461975 min. - na boisku walka w środku pola, a żół...190Inside
\n", "

49921 rows × 17 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Partial\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na ulicy w towarzystwie jakich\\u015b os\\u00f3b. 
To mnie chroni przed lud\\u017ami, kt\\u00f3rzy nieustannie chc\\u0105 sobie ze mn\\u0105 robi\\u0107 zdj\\u0119cia lub prosz\\u0105 o autograf' - doda\\u0142.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30,\n \"min\": 4,\n \"max\": 812,\n \"num_unique_values\": 318,\n \"samples\": [\n 130\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 207,\n \"min\": 9,\n \"max\": 5216,\n \"num_unique_values\": 1437,\n \"samples\": [\n 172\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 36,\n \"min\": 1,\n \"max\": 2417,\n \"num_unique_values\": 329,\n \"samples\": [\n 67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 250,\n \"min\": 6,\n \"max\": 13840,\n \"num_unique_values\": 1690,\n \"samples\": [\n 934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28766,\n \"min\": 0,\n \"max\": 99846,\n \"num_unique_values\": 49921,\n \"samples\": [\n 45784\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 370,\n \"num_unique_values\": 193,\n \"samples\": [\n 61\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Token Limit Check\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Inside\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "df['Token Limit Check'].value_counts()" ], "metadata": { "id": "gWfUnO17r8zb", "colab": { "base_uri": "https://localhost:8080/", "height": 146 }, "outputId": "e1affdc5-820a-4e04-bf84-0d017b631ef7" }, "execution_count": 22, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Token Limit Check\n", "Inside 49921\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
Token Limit Check
Inside49921
\n", "

" ] }, "metadata": {}, "execution_count": 22 } ] }, { "cell_type": "code", "source": [ "df['Split Location'].max()" ], "metadata": { "id": "HdNmbX6yr_Lv", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "8dfe426e-6e25-4825-ebba-0cf68cca15e4" }, "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "375" ] }, "metadata": {}, "execution_count": 23 } ] }, { "cell_type": "code", "source": [ "# prompt: 2 new columns in df_demo as series/list of zeroes and ones : WORDS_REAL : length is row's Word Count, start with row's Split Location number of zeroes and end with ones : WORDS_PRED : lenght is rows' Word Count , start with row's label_pred number of zeroes and end wit ones\n", "def create_word_series(row, column_name):\n", " word_count = row['New Word Count']\n", " split_location = row[column_name]\n", " series = [0] * split_location + [1] * (word_count - split_location)\n", " return series\n", "df['WORDS_REAL'] = df.apply(create_word_series, axis=1, args=('Split Location',))\n", "df['WORDS_PRED'] = df.apply(create_word_series, axis=1, args=('label_pred',))\n", "df" ], "metadata": { "id": "R6waU4p-sCcV", "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "outputId": "ca977add-e893-415a-8fb2-db28f4ebaed3" }, "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POL GPT-o1 Partial Test \n", "1 POL GPT-o1 Partial Test \n", "2 POL Gemini-Flash-1.5 Partial Test \n", "3 POL GPT-o1 Partial Test \n", "4 POL Claude-Sonnet-3.5 Partial Test \n", "... ... ... ... ... \n", "49916 POL Amazon-Nova-Pro-1.0 Partial Test \n", "49917 POL Claude-Haiku-3.5 Partial Test \n", "49918 POL Aya-23 Partial Test \n", "49919 POL Aya-23 Partial Test \n", "49920 POL Claude-Haiku-3.5 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Godne polecenia (choć paliwożerne) są turbodoł... 
41 \n", "1 Natalia Partyka też przegrała 3:4, tyle że z K... 52 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 43 \n", "3 Opiekowała się rodzinami oficerów, którzy zgin... 49 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 46 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 43 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 63 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 33 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 56 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 32 \n", "\n", " Original Char Count Split Location \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " Modified text New Word Count \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 \n", "\n", " New Char Count id label_pred \\\n", "0 705 0 17 \n", "1 618 1 26 \n", "2 404 3 24 \n", "3 1409 4 37 \n", "4 566 8 25 \n", "... ... ... ... \n", "49916 707 99838 23 \n", "49917 660 99839 36 \n", "49918 1033 99841 12 \n", "49919 850 99845 28 \n", "49920 478 99846 19 \n", "\n", " text label_gold diff \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 17 0 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 
30 4 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 24 0 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 37 0 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 25 0 \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 26 3 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 36 0 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 17 5 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 28 0 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 19 0 \n", "\n", " Token Limit Check WORDS_REAL \\\n", "0 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... ... \n", "49916 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49917 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49918 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49919 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49920 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED \n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... \n", "49916 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49917 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49918 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ... \n", "49919 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49920 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", "[49921 rows x 19 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiffToken Limit CheckWORDS_REALWORDS_PRED
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95705017Godne polecenia ( choć paliwożerne ) są turbod...170Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95618126Natalia Partyka też przegrała 3:4 , tyle że z ...304Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58404324Niemcy chcieli wybudować trzy takie wjazdy, uk...240Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...2111409437Opiekowała się rodzinami oficerów , którzy zgi...370Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76566825W minionym tygodniu gminne dożynki odbyły się ...250Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
............................................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...917079983823Efektownymi porównaniami rzucali też szefowie ...263Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...946609983936Film - który wywołał oburzenie krytyków w Cann...360Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...14310339984112Pierwszy trailer gry Dead Island zrobił wielką...175Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...1098509984528O ujawnienie jego treści od dawna walczą organ...280Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77478998461975 min. - na boisku walka w środku pola, a żół...190Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
\n", "

49921 rows × 19 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 49921,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POL\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Partial\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49921,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na ulicy w towarzystwie jakich\\u015b os\\u00f3b. 
To mnie chroni przed lud\\u017ami, kt\\u00f3rzy nieustannie chc\\u0105 sobie ze mn\\u0105 robi\\u0107 zdj\\u0119cia lub prosz\\u0105 o autograf' - doda\\u0142.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30,\n \"min\": 4,\n \"max\": 812,\n \"num_unique_values\": 318,\n \"samples\": [\n 130\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 207,\n \"min\": 9,\n \"max\": 5216,\n \"num_unique_values\": 1437,\n \"samples\": [\n 172\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 36,\n \"min\": 1,\n \"max\": 2417,\n \"num_unique_values\": 329,\n \"samples\": [\n 67\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 250,\n \"min\": 6,\n \"max\": 13840,\n \"num_unique_values\": 1690,\n \"samples\": [\n 934\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28766,\n \"min\": 0,\n \"max\": 99846,\n \"num_unique_values\": 49921,\n \"samples\": [\n 45784\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20,\n \"min\": 0,\n \"max\": 301,\n \"num_unique_values\": 218,\n \"samples\": [\n 138\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49920,\n \"samples\": [\n \"Bardziej uci\\u0105\\u017cliwa, ni\\u017c pogr\\u00f3\\u017cki, Palikotowi wydaje si\\u0119 jego popularno\\u015b\\u0107. Pose\\u0142 wyzna\\u0142, \\u017ce nigdy nie chodzi po ulicy sam. 'Zawsze staram si\\u0119 by\\u0107 na Here's a continuation in Polish:\\n\\n\\\"...bie\\u017c\\u0105co i mie\\u0107 ochron\\u0119\\\" - powiedzia\\u0142 w jednym z wywiad\\u00f3w. Jego s\\u0142owa wskazuj\\u0105, \\u017ce mimo pozornej pewno\\u015bci siebie, obawia si\\u0119 konsekwencji swojej kontrowersyjnej dzia\\u0142alno\\u015bci publicznej. 
Politycy cz\\u0119sto spotykaj\\u0105 si\\u0119 z r\\u00f3\\u017cnymi formami nacisku i zagro\\u017ce\\u0144, ale w przypadku Palikota sytuacja wydaje si\\u0119 szczeg\\u00f3lnie napi\\u0119ta.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 22,\n \"min\": 0,\n \"max\": 375,\n \"num_unique_values\": 233,\n \"samples\": [\n 64\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 15,\n \"min\": 0,\n \"max\": 370,\n \"num_unique_values\": 193,\n \"samples\": [\n 61\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Token Limit Check\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Inside\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORDS_REAL\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORDS_PRED\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "# prompt: 4 new columns : ROW_TP, ROW_FP, ROW_TN , ROW_FN : based on zeroes and ones in WORDS_PRED , WORDS_REAL . 
def calculate_metrics(row):
    """Count word-level confusion-matrix cells for one row.

    Compares ``row['WORDS_REAL']`` (reference labels) with
    ``row['WORDS_PRED']`` (predicted labels) position by position, treating
    1 as the positive class and 0 as the negative class. Pairs containing any
    other value are not counted in any cell.

    If the two sequences differ in length, only the overlapping prefix is
    compared (``zip`` stops at the shorter one) instead of raising
    ``IndexError`` when the predicted sequence is the shorter of the two.

    Args:
        row: Mapping-like object (for example the row handed over by
            ``df.apply(..., axis=1)``) with ``'WORDS_REAL'`` and
            ``'WORDS_PRED'`` sequences of 0/1 values.

    Returns:
        tuple[int, int, int, int]: ``(tp, fp, tn, fn)``.
    """
    # Look the two sequences up once; indexing the row again for every word
    # is needlessly slow when `row` is a pandas Series.
    words_real = row['WORDS_REAL']
    words_pred = row['WORDS_PRED']

    tp = fp = tn = fn = 0
    for actual, predicted in zip(words_real, words_pred):
        if actual == 1 and predicted == 1:
            tp += 1
        elif actual == 0 and predicted == 1:
            fp += 1
        elif actual == 0 and predicted == 0:
            tn += 1
        elif actual == 1 and predicted == 0:
            fn += 1
    return tp, fp, tn, fn
56 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 32 \n", "\n", " Original Char Count Split Location \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " Modified text New Word Count ... \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 ... \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 ... \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 ... \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 ... \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 ... \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 ... \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 ... \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 ... \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 ... \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 ... \n", "\n", " text label_gold diff \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 17 0 \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 30 4 \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 24 0 \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 37 0 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 25 0 \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 26 3 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 36 0 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 17 5 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 28 0 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 19 0 \n", "\n", " Token Limit Check WORDS_REAL \\\n", "0 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", "3 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... ... \n", "49916 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49917 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49918 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49919 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49920 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED ROW_TP ROW_FP ROW_TN \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 78 0 17 \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 65 4 26 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 34 0 24 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 174 0 37 \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 51 0 25 \n", "... ... ... ... ... \n", "49916 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 65 3 23 \n", "49917 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 58 0 36 \n", "49918 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ... 126 5 12 \n", "49919 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 81 0 28 \n", "49920 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 58 0 19 \n", "\n", " ROW_FN \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "49916 0 \n", "49917 0 \n", "49918 0 \n", "49919 0 \n", "49920 0 \n", "\n", "[49921 rows x 23 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...textlabel_golddiffToken Limit CheckWORDS_REALWORDS_PREDROW_TPROW_FPROW_TNROW_FN
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95...Godne polecenia ( choć paliwożerne ) są turbod...170Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...780170
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95...Natalia Partyka też przegrała 3:4 , tyle że z ...304Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...654260
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58...Niemcy chcieli wybudować trzy takie wjazdy, uk...240Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...340240
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...211...Opiekowała się rodzinami oficerów , którzy zgi...370Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...1740370
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76...W minionym tygodniu gminne dożynki odbyły się ...250Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...510250
..................................................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...91...Efektownymi porównaniami rzucali też szefowie ...263Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...653230
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...94...Film - który wywołał oburzenie krytyków w Cann...360Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...580360
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...143...Pierwszy trailer gry Dead Island zrobił wielką...175Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...1265120
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...109...O ujawnienie jego treści od dawna walczą organ...280Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...810280
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77...75 min. - na boisku walka w środku pola, a żół...190Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...580190
\n", "

49921 rows × 23 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def calculate_row_metrics(row):
    """Derive accuracy, precision, recall and F1 from one row's confusion counts.

    Every ratio falls back to 0.0 when its denominator is zero, so rows with
    no words, no predicted positives or no actual positives do not raise
    ``ZeroDivisionError``.

    Args:
        row: Mapping-like object (for example the row handed over by
            ``df.apply(..., axis=1)``) with numeric ``'ROW_TP'``,
            ``'ROW_FP'``, ``'ROW_TN'`` and ``'ROW_FN'`` entries.

    Returns:
        tuple: ``(accuracy, precision, recall, f1_score)``.
    """
    tp = row['ROW_TP']
    fp = row['ROW_FP']
    tn = row['ROW_TN']
    fn = row['ROW_FN']

    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    # F1 is the harmonic mean of precision and recall; it is 0.0 when both are 0.
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
    return accuracy, precision, recall, f1_score
43 \n", "3 Opiekowała się rodzinami oficerów, którzy zgin... 49 \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 46 \n", "... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 43 \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 63 \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 33 \n", "49919 O ujawnienie jego treści od dawna walczą organ... 56 \n", "49920 75 min. - na boisku walka w środku pola, a żół... 32 \n", "\n", " Original Char Count Split Location \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " Modified text New Word Count ... \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 ... \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 ... \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 ... \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 ... \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 ... \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 ... \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 ... \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 ... \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 ... \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 ... \n", "\n", " WORDS_REAL \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... \n", "49916 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49917 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49918 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", "49919 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49920 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED ROW_TP ROW_FP \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 78 0 \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 65 4 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 34 0 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 174 0 \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 51 0 \n", "... ... ... ... \n", "49916 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 65 3 \n", "49917 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 58 0 \n", "49918 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ... 126 5 \n", "49919 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 81 0 \n", "49920 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 58 0 \n", "\n", " ROW_TN ROW_FN ROW_ACC ROW_PREC ROW_REC ROW_F1 \n", "0 17 0 1.000000 1.000000 1.0 1.000000 \n", "1 26 0 0.957895 0.942029 1.0 0.970149 \n", "2 24 0 1.000000 1.000000 1.0 1.000000 \n", "3 37 0 1.000000 1.000000 1.0 1.000000 \n", "4 25 0 1.000000 1.000000 1.0 1.000000 \n", "... ... ... ... ... ... ... \n", "49916 23 0 0.967033 0.955882 1.0 0.977444 \n", "49917 36 0 1.000000 1.000000 1.0 1.000000 \n", "49918 12 0 0.965035 0.961832 1.0 0.980545 \n", "49919 28 0 1.000000 1.000000 1.0 1.000000 \n", "49920 19 0 1.000000 1.000000 1.0 1.000000 \n", "\n", "[49921 rows x 27 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...WORDS_REALWORDS_PREDROW_TPROW_FPROW_TNROW_FNROW_ACCROW_PRECROW_RECROW_F1
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...7801701.0000001.0000001.01.000000
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...6542600.9578950.9420291.00.970149
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...3402401.0000001.0000001.01.000000
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...211...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...17403701.0000001.0000001.01.000000
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...5102501.0000001.0000001.01.000000
..................................................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...91...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...6532300.9670330.9558821.00.977444
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...94...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...5803601.0000001.0000001.01.000000
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...143...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...12651200.9650350.9618321.00.980545
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...109...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...8102801.0000001.0000001.01.000000
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...5801901.0000001.0000001.01.000000
\n", "

49921 rows × 27 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
def calculate_percentage_of_ones(row):
    """Return sum(row['WORDS_PRED']) / len(row['WORDS_PRED']), or 0 when it is empty.

    Despite the name, the result is a fraction, not a value scaled to 0-100.
    NOTE(review): presumably WORDS_PRED holds per-word 0/1 labels, which makes
    this the share of words predicted as 1 -- confirm against the producer.
    """
    predicted_labels = row['WORDS_PRED']
    label_count = len(predicted_labels)
    if label_count == 0:
        return 0
    return sum(predicted_labels) / label_count


# Share of label 1 per row, and its complement as the share of label 0.
label_one_share = df.apply(calculate_percentage_of_ones, axis=1)
df["Label : 1"] = label_one_share
df["Label : 0"] = 1.0 - label_one_share
df
32 \n", "\n", " Original Char Count Split Location \\\n", "0 344 17 \n", "1 317 30 \n", "2 297 24 \n", "3 376 37 \n", "4 325 25 \n", "... ... ... \n", "49916 311 26 \n", "49917 449 36 \n", "49918 250 17 \n", "49919 393 28 \n", "49920 192 19 \n", "\n", " Modified text New Word Count ... \\\n", "0 Godne polecenia ( choć paliwożerne ) są turbod... 95 ... \n", "1 Natalia Partyka też przegrała 3:4 , tyle że z ... 95 ... \n", "2 Niemcy chcieli wybudować trzy takie wjazdy, uk... 58 ... \n", "3 Opiekowała się rodzinami oficerów , którzy zgi... 211 ... \n", "4 W minionym tygodniu gminne dożynki odbyły się ... 76 ... \n", "... ... ... ... \n", "49916 Efektownymi porównaniami rzucali też szefowie ... 91 ... \n", "49917 Film - który wywołał oburzenie krytyków w Cann... 94 ... \n", "49918 Pierwszy trailer gry Dead Island zrobił wielką... 143 ... \n", "49919 O ujawnienie jego treści od dawna walczą organ... 109 ... \n", "49920 75 min. - na boisku walka w środku pola, a żół... 77 ... \n", "\n", " ROW_TP ROW_FP ROW_TN ROW_FN ROW_ACC ROW_PREC ROW_REC ROW_F1 \\\n", "0 78 0 17 0 1.000000 1.000000 1.0 1.000000 \n", "1 65 4 26 0 0.957895 0.942029 1.0 0.970149 \n", "2 34 0 24 0 1.000000 1.000000 1.0 1.000000 \n", "3 174 0 37 0 1.000000 1.000000 1.0 1.000000 \n", "4 51 0 25 0 1.000000 1.000000 1.0 1.000000 \n", "... ... ... ... ... ... ... ... ... \n", "49916 65 3 23 0 0.967033 0.955882 1.0 0.977444 \n", "49917 58 0 36 0 1.000000 1.000000 1.0 1.000000 \n", "49918 126 5 12 0 0.965035 0.961832 1.0 0.980545 \n", "49919 81 0 28 0 1.000000 1.000000 1.0 1.000000 \n", "49920 58 0 19 0 1.000000 1.000000 1.0 1.000000 \n", "\n", " Label : 1 Label : 0 \n", "0 0.821053 0.178947 \n", "1 0.726316 0.273684 \n", "2 0.586207 0.413793 \n", "3 0.824645 0.175355 \n", "4 0.671053 0.328947 \n", "... ... ... 
\n", "49916 0.747253 0.252747 \n", "49917 0.617021 0.382979 \n", "49918 0.916084 0.083916 \n", "49919 0.743119 0.256881 \n", "49920 0.753247 0.246753 \n", "\n", "[49921 rows x 29 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...ROW_TPROW_FPROW_TNROW_FNROW_ACCROW_PRECROW_RECROW_F1Label : 1Label : 0
0POLGPT-o1PartialTestGodne polecenia (choć paliwożerne) są turbodoł...4134417Godne polecenia ( choć paliwożerne ) są turbod...95...7801701.0000001.0000001.01.0000000.8210530.178947
1POLGPT-o1PartialTestNatalia Partyka też przegrała 3:4, tyle że z K...5231730Natalia Partyka też przegrała 3:4 , tyle że z ...95...6542600.9578950.9420291.00.9701490.7263160.273684
2POLGemini-Flash-1.5PartialTestNiemcy chcieli wybudować trzy takie wjazdy, uk...4329724Niemcy chcieli wybudować trzy takie wjazdy, uk...58...3402401.0000001.0000001.01.0000000.5862070.413793
3POLGPT-o1PartialTestOpiekowała się rodzinami oficerów, którzy zgin...4937637Opiekowała się rodzinami oficerów , którzy zgi...211...17403701.0000001.0000001.01.0000000.8246450.175355
4POLClaude-Sonnet-3.5PartialTestW minionym tygodniu gminne dożynki odbyły się ...4632525W minionym tygodniu gminne dożynki odbyły się ...76...5102501.0000001.0000001.01.0000000.6710530.328947
..................................................................
49916POLAmazon-Nova-Pro-1.0PartialTestEfektownymi porównaniami rzucali też szefowie ...4331126Efektownymi porównaniami rzucali też szefowie ...91...6532300.9670330.9558821.00.9774440.7472530.252747
49917POLClaude-Haiku-3.5PartialTestFilm - który wywołał oburzenie krytyków w Cann...6344936Film - który wywołał oburzenie krytyków w Cann...94...5803601.0000001.0000001.01.0000000.6170210.382979
49918POLAya-23PartialTestPierwszy trailer gry Dead Island zrobił wielką...3325017Pierwszy trailer gry Dead Island zrobił wielką...143...12651200.9650350.9618321.00.9805450.9160840.083916
49919POLAya-23PartialTestO ujawnienie jego treści od dawna walczą organ...5639328O ujawnienie jego treści od dawna walczą organ...109...8102801.0000001.0000001.01.0000000.7431190.256881
49920POLClaude-Haiku-3.5PartialTest75 min. - na boisku walka w środku pola, a żół...321921975 min. - na boisku walka w środku pola, a żół...77...5801901.0000001.0000001.01.0000000.7532470.246753
\n", "

49921 rows × 29 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
# Row-level metric columns and the labels used when printing their means.
_ROW_METRIC_COLUMNS = ('ROW_ACC', 'ROW_PREC', 'ROW_REC', 'ROW_F1')
_METRIC_LABELS = ('Accuracy', 'Precision', 'Recall', 'F1-score')
_BANNER = "######################################"


def _mean_row_metrics(frame):
    """Return the means of ROW_ACC, ROW_PREC, ROW_REC and ROW_F1 over `frame`'s rows.

    Every row counts equally, whatever its word count.
    """
    return tuple(frame[column].mean() for column in _ROW_METRIC_COLUMNS)


df_unchanged = df[df['Type'] == 'Unchanged']
df_rewritten = df[df['Type'] == 'Rewritten']
df_partial = df[df['Type'] == 'Partial']

print(_BANNER)
print(" METRICS BY TEXT TYPE : ")
print(_BANNER)
# NOTE(review): the saved output reports precision/recall/F1 of exactly 0.0 for
# 'Unchanged' rows. That looks like the zero fallback in calculate_row_metrics
# (no positive labels) rather than real errors, and it lowers the "All Cases"
# averages below -- confirm this is the intended convention.
for type_name, type_frame in (
    ('Partial', df_partial),
    ('Unchanged', df_unchanged),
    ('Rewritten', df_rewritten),
):
    AVG_ACC, AVG_PREC, AVG_REC, AVG_F1 = _mean_row_metrics(type_frame)
    for label, value in zip(_METRIC_LABELS, (AVG_ACC, AVG_PREC, AVG_REC, AVG_F1)):
        print(f"{type_name} Cases : Average {label} : ", value)

print(_BANNER)
print(" METRICS OVERALL : ")
print(_BANNER)
# prompt: print AVG_ACC, AVG_PREC , AVG_REC , AVG_F1 as mean of values in columns ROW_ACC , ROW_REC , ROW_PREC , ROW_F1 from dataframe df
AVG_ACC, AVG_PREC, AVG_REC, AVG_F1 = _mean_row_metrics(df)
for label, value in zip(_METRIC_LABELS, (AVG_ACC, AVG_PREC, AVG_REC, AVG_F1)):
    print(f"All Cases : Average {label}:", value)
print(_BANNER)

# prompt: Also print overall ACC,PREC,REC,F1 based on values of columns ROW_TN,ROW_TP,ROW_FN,ROW_FP
# Pool the confusion counts over every row, then reuse calculate_row_metrics
# (defined in an earlier cell) so the zero-guarded formulas live in one place.
total_tp = df['ROW_TP'].sum()
total_fp = df['ROW_FP'].sum()
total_tn = df['ROW_TN'].sum()
total_fn = df['ROW_FN'].sum()
accuracy, precision, recall, f1_score = calculate_row_metrics(
    {'ROW_TP': total_tp, 'ROW_FP': total_fp, 'ROW_TN': total_tn, 'ROW_FN': total_fn}
)
print("Overall Accuracy:", accuracy)
print("Overall Precision:", precision)
print("Overall Recall:", recall)
print("Overall F1-score:", f1_score)
# For every generator in 'LLM used', print the per-row metric means of each
# text type; a type with no rows for that generator is skipped.
llm_values = df['LLM used'].unique()
for llm in llm_values:
    print("LLM used:", llm)
    df_llm = df[df['LLM used'] == llm]
    for text_type in ['Partial', 'Unchanged', 'Rewritten']:
        df_subset = df_llm[df_llm['Type'] == text_type]
        if not df_subset.empty:
            avg_acc, avg_prec, avg_rec, avg_f1 = (
                df_subset[column].mean()
                for column in ('ROW_ACC', 'ROW_PREC', 'ROW_REC', 'ROW_F1')
            )
            labelled_means = zip(
                ('Accuracy', 'Precision', 'Recall', 'F1-score'),
                (avg_acc, avg_prec, avg_rec, avg_f1),
            )
            for metric_label, metric_mean in labelled_means:
                print(f"{text_type} Cases : Average {metric_label} : {metric_mean}")
    print("######################################")
"execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "LLM used: GPT-o1\n", "Partial Cases : Average Accuracy : 0.9900886943011742\n", "Partial Cases : Average Precision : 0.9898292924387625\n", "Partial Cases : Average Recall : 0.9966696073322895\n", "Partial Cases : Average F1-score : 0.9931007127418364\n", "Unchanged Cases : Average Accuracy : 0.5877615582528447\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9655167815663577\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9655167815663577\n", "Rewritten Cases : Average F1-score : 0.9778338017296966\n", "######################################\n", "LLM used: Gemini-Flash-1.5\n", "Partial Cases : Average Accuracy : 0.9858661554067739\n", "Partial Cases : Average Precision : 0.982400864909452\n", "Partial Cases : Average Recall : 0.9993504463411779\n", "Partial Cases : Average F1-score : 0.9888783303226257\n", "Unchanged Cases : Average Accuracy : 0.58729546556291\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9996271186440677\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9996271186440677\n", "Rewritten Cases : Average F1-score : 0.9997943925233644\n", "######################################\n", "LLM used: Claude-Sonnet-3.5\n", "Partial Cases : Average Accuracy : 0.9876387113530368\n", "Partial Cases : Average Precision : 0.9854633152191319\n", "Partial Cases : Average Recall : 0.9983041958428098\n", "Partial Cases : Average F1-score : 0.9909198661603826\n", "Unchanged Cases : Average Accuracy : 0.5866682830873995\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average 
F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9977442289456365\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9977442289456365\n", "Rewritten Cases : Average F1-score : 0.9986372251850977\n", "######################################\n", "LLM used: Mistral-Large-2411\n", "Partial Cases : Average Accuracy : 0.9543277810788495\n", "Partial Cases : Average Precision : 0.944617239146211\n", "Partial Cases : Average Recall : 0.9949578918997638\n", "Partial Cases : Average F1-score : 0.9660994084987479\n", "Unchanged Cases : Average Accuracy : 0.5492110744312836\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9759182794268021\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9759182794268021\n", "Rewritten Cases : Average F1-score : 0.9834944517449998\n", "######################################\n", "LLM used: Aya-23\n", "Partial Cases : Average Accuracy : 0.8767260605127271\n", "Partial Cases : Average Precision : 0.8618749765178467\n", "Partial Cases : Average Recall : 0.9702092195937273\n", "Partial Cases : Average F1-score : 0.8907038933798743\n", "Unchanged Cases : Average Accuracy : 0.528225624248364\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9329830932550148\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9329830932550148\n", "Rewritten Cases : Average F1-score : 0.9443295234515782\n", "######################################\n", "LLM used: Claude-Haiku-3.5\n", "Partial Cases : Average Accuracy : 0.9976961985231009\n", "Partial Cases : Average Precision : 0.9972387432394554\n", "Partial Cases : Average Recall : 0.999116069215661\n", "Partial Cases : Average F1-score : 
0.9976465631544216\n", "Unchanged Cases : Average Accuracy : 0.5784118276839278\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9295246328683611\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9295246328683611\n", "Rewritten Cases : Average F1-score : 0.9514552080734575\n", "######################################\n", "LLM used: Amazon-Nova-Lite-1.0\n", "Partial Cases : Average Accuracy : 0.972236938979804\n", "Partial Cases : Average Precision : 0.9757858822682315\n", "Partial Cases : Average Recall : 0.9902413785282811\n", "Partial Cases : Average F1-score : 0.981345742681404\n", "Unchanged Cases : Average Accuracy : 0.5637605464608331\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9854525469258404\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9854525469258404\n", "Rewritten Cases : Average F1-score : 0.989592574454153\n", "######################################\n", "LLM used: Gemini-Pro-1.5\n", "Partial Cases : Average Accuracy : 0.9187598037921633\n", "Partial Cases : Average Precision : 0.9134456550182768\n", "Partial Cases : Average Recall : 0.9774618103802449\n", "Partial Cases : Average F1-score : 0.9301702277639385\n", "Unchanged Cases : Average Accuracy : 0.5683735108431864\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9941310291220241\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9941310291220241\n", "Rewritten Cases : Average F1-score : 0.9961754168701212\n", "######################################\n", "LLM used: GPT-4o\n", "Partial 
# For every generator in 'LLM used', pool the confusion counts of all its rows
# and derive accuracy / precision / recall / F1 from the pooled totals.
llm_values = df['LLM used'].unique()
for llm in llm_values:
    print("LLM used:", llm)
    df_llm = df[df['LLM used'] == llm]
    total_tp = df_llm['ROW_TP'].sum()
    total_fp = df_llm['ROW_FP'].sum()
    total_tn = df_llm['ROW_TN'].sum()
    total_fn = df_llm['ROW_FN'].sum()
    # Reuse calculate_row_metrics (defined in an earlier cell) instead of a
    # third copy of the zero-guarded formulas; it only indexes its argument by
    # the four ROW_* keys, so a plain dict of totals is sufficient.
    accuracy, precision, recall, f1_score = calculate_row_metrics(
        {'ROW_TP': total_tp, 'ROW_FP': total_fp, 'ROW_TN': total_tn, 'ROW_FN': total_fn}
    )
    print("Overall Accuracy:", accuracy)
    print("Overall Precision:", precision)
    print("Overall Recall:", recall)
    print("Overall F1-score:", f1_score)
    print("######################################")
# Print a structural summary of the results frame: index range, one line per
# column with its non-null count and dtype, and the memory footprint.
df.info()

# Persist the full results frame, including the per-row metric columns, to a
# CSV file in the current working directory.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra leading column -- confirm that downstream readers expect it.
df.to_csv("POL-INFERENCE-4.csv")

# Manual reminder for the person running the notebook: save it, then download
# both the notebook and the CSV from the runtime.
print("CLICK CTRL+S, WAIT 2 SEC FOR IT TO BE SAVED, DOWNLOAD BOTH CODE AND THE CSV FILE FROM RUNTIME")