{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "A100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "cf15308d3f2e49ada30be466b73a7ad8": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_c0c2bef72b5b424ea8c319352fcd9afd", "IPY_MODEL_72bc6f5bdaaa4e2c8df022c6dd5f8299", "IPY_MODEL_0f2befac587a473ea0fe439cc44d8782", "IPY_MODEL_783c73eb74c44d3b838f7e13c938941c", "IPY_MODEL_caf01ed869cb4262830b43a10e51fb89" ], "layout": "IPY_MODEL_3123687fe2dc4f869d745fa5fbcf93a9" } }, "c0c2bef72b5b424ea8c319352fcd9afd": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_1dc7dd582bae42b9b31dec146877e7cb", "placeholder": "​", "style": "IPY_MODEL_b981faab214d44198e2cdb669b2f29b0", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "72bc6f5bdaaa4e2c8df022c6dd5f8299": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_e490ced073e246a5b91c7bf90c688f08", "placeholder": "​", "style": "IPY_MODEL_a481f7f3e5654428a8e5cb47114907fe", "value": "" } }, "0f2befac587a473ea0fe439cc44d8782": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_59fee8047d4e4aea92a0aff165d5b938", "style": "IPY_MODEL_b895ba2fd6fb43c5a3d0acd1ff21c924", "value": true } }, "783c73eb74c44d3b838f7e13c938941c": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_01080cf4aff9481f8475a4dfc508f23a", "style": "IPY_MODEL_81cf76e4280c41868988982ef32fc62b", "tooltip": "" } }, "caf01ed869cb4262830b43a10e51fb89": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_58699dd790a543f9a5f276f6149b8298", "placeholder": "​", "style": "IPY_MODEL_775d5b3e765a47408b0c02b0827ee2a2", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " } }, "3123687fe2dc4f869d745fa5fbcf93a9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "1dc7dd582bae42b9b31dec146877e7cb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", 
"model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b981faab214d44198e2cdb669b2f29b0": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e490ced073e246a5b91c7bf90c688f08": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, 
"grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a481f7f3e5654428a8e5cb47114907fe": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "59fee8047d4e4aea92a0aff165d5b938": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b895ba2fd6fb43c5a3d0acd1ff21c924": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "01080cf4aff9481f8475a4dfc508f23a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "81cf76e4280c41868988982ef32fc62b": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "58699dd790a543f9a5f276f6149b8298": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "775d5b3e765a47408b0c02b0827ee2a2": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "c38c9b679a7c4ee4984aa16dc03eac15": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_10dfac0146c24d84ae0d9725a748bae9", "IPY_MODEL_342bd2bb1ee84c728565a0d90e856ec5", "IPY_MODEL_e028ef22037b425c9106cdbdab23187d" ], "layout": "IPY_MODEL_cb3bb00281aa4b549079efb1bdc7c544" } }, "10dfac0146c24d84ae0d9725a748bae9": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_5ae82b0f63f04fb298029d954a872631", "placeholder": "​", "style": "IPY_MODEL_ec675ed64e464784b6cc0f2190be6bc3", "value": "config.json: 100%" } }, "342bd2bb1ee84c728565a0d90e856ec5": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b661f343b04049a792078ae93683c5ee", "max": 772, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_fdb923edb4294cb495b8c2854b111a8a", "value": 772 } }, "e028ef22037b425c9106cdbdab23187d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9cbc7aea8f9c4c78b7e3ac1ba0fa374a", "placeholder": "​", "style": "IPY_MODEL_51801b3dc78444e7b20c611fb068c1ea", "value": " 772/772 [00:00<00:00, 67.6kB/s]" } }, "cb3bb00281aa4b549079efb1bdc7c544": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "5ae82b0f63f04fb298029d954a872631": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, 
"border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ec675ed64e464784b6cc0f2190be6bc3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "b661f343b04049a792078ae93683c5ee": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fdb923edb4294cb495b8c2854b111a8a": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "9cbc7aea8f9c4c78b7e3ac1ba0fa374a": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "51801b3dc78444e7b20c611fb068c1ea": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", 
"model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3512a389212b44f5bf24964313ba0553": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8a11dbe5d61e4a2a8f3aa77810db0f9d", "IPY_MODEL_fdbff13cf9184aaabb40f53f3ad55b31", "IPY_MODEL_0ed8981aa87f4157af6da6c027aa85a8" ], "layout": "IPY_MODEL_d5d24ffa45734a3c8031f2154c5fe0f9" } }, "8a11dbe5d61e4a2a8f3aa77810db0f9d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_638c621f3c9341b38046316abceea5d0", "placeholder": "​", "style": "IPY_MODEL_cfa5c17a54f24e4fade919805b26c7d3", "value": "tf_model.h5: 100%" } }, "fdbff13cf9184aaabb40f53f3ad55b31": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": 
"ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bf9abfbd39af4ba08d95c94a9991e574", "max": 1246320936, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_c13439af187445e49a96d3456577e2b1", "value": 1246320936 } }, "0ed8981aa87f4157af6da6c027aa85a8": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_27c21ac1c15441809ae6a0fa9d130c6e", "placeholder": "​", "style": "IPY_MODEL_0aa45547ba114fbabfebd5af1822d433", "value": " 1.25G/1.25G [00:05<00:00, 245MB/s]" } }, "d5d24ffa45734a3c8031f2154c5fe0f9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, 
"right": null, "top": null, "visibility": null, "width": null } }, "638c621f3c9341b38046316abceea5d0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "cfa5c17a54f24e4fade919805b26c7d3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bf9abfbd39af4ba08d95c94a9991e574": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", 
"_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c13439af187445e49a96d3456577e2b1": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "27c21ac1c15441809ae6a0fa9d130c6e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": 
null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0aa45547ba114fbabfebd5af1822d433": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ac4adbc3505d4342b2469745b1e5d7ea": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_ac41c21881c64c0fac31e35fa0403cf8", "IPY_MODEL_e8c486eba8fd400bbb8dc4a711bc71de", "IPY_MODEL_32ec57921c9446688c5f714c7102a17e" ], "layout": "IPY_MODEL_51f62d887a0a496497ebcd7c8895e08e" } }, "ac41c21881c64c0fac31e35fa0403cf8": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4177425ad63d4c6a888a65dacc222a43", "placeholder": "​", "style": 
"IPY_MODEL_38b458bc3cda49739c9de553ca004884", "value": "tokenizer_config.json: 100%" } }, "e8c486eba8fd400bbb8dc4a711bc71de": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8be4f21b003b4283a1fda2f77bde2bd4", "max": 453, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_62e592b6f4834cc790d9a22abc8b33c5", "value": 453 } }, "32ec57921c9446688c5f714c7102a17e": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_9b4b45e06353469fbe9e898b37f256e4", "placeholder": "​", "style": "IPY_MODEL_8fdbe3221cde440f975beb0813f86336", "value": " 453/453 [00:00<00:00, 37.7kB/s]" } }, "51f62d887a0a496497ebcd7c8895e08e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": 
null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4177425ad63d4c6a888a65dacc222a43": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "38b458bc3cda49739c9de553ca004884": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, 
"_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "8be4f21b003b4283a1fda2f77bde2bd4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "62e592b6f4834cc790d9a22abc8b33c5": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "9b4b45e06353469fbe9e898b37f256e4": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, 
"_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8fdbe3221cde440f975beb0813f86336": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "fb3964287ade4f73a620e4284f185288": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_d6b31f346d1d40968d4302fbca9cd6b3", "IPY_MODEL_89df189d451945e3a579f2bce1ff00e2", "IPY_MODEL_0a54c5014a874ceca3fa04c2679b8dfe" ], "layout": "IPY_MODEL_c823199cef4647efa57071d3308b9c99" } }, "d6b31f346d1d40968d4302fbca9cd6b3": { "model_module": 
"@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ec0dea5d0c0948b88ac710d6f8b5f3ec", "placeholder": "​", "style": "IPY_MODEL_9f0bece938544ed7a80d03eca254a5e9", "value": "tokenizer.json: 100%" } }, "89df189d451945e3a579f2bce1ff00e2": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a0f33a9d1c8e441492975a69bd4d4bd2", "max": 17082660, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_2acf4cb50e944d3592572e0bec3312e2", "value": 17082660 } }, "0a54c5014a874ceca3fa04c2679b8dfe": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_68f839b7400840b6957891438b888aeb", "placeholder": "​", "style": "IPY_MODEL_1b4698f8882f4482bc4278e7b445d02b", "value": " 17.1M/17.1M [00:00<00:00, 206MB/s]" } }, "c823199cef4647efa57071d3308b9c99": { "model_module": "@jupyter-widgets/base", "model_name": 
"LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ec0dea5d0c0948b88ac710d6f8b5f3ec": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": 
null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9f0bece938544ed7a80d03eca254a5e9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "a0f33a9d1c8e441492975a69bd4d4bd2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2acf4cb50e944d3592572e0bec3312e2": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": 
"1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "68f839b7400840b6957891438b888aeb": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1b4698f8882f4482bc4278e7b445d02b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6a63b581f4d14ccdafe52cebfacf7194": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b93a62df73854ec992c9dcba7d554c24", "IPY_MODEL_2f022591fff740ac89c771ea15a5fbe9", "IPY_MODEL_639214f516034b3ebec50ed3035b5106" ], "layout": "IPY_MODEL_84ef02b32cab488d9501e90626be1452" } }, "b93a62df73854ec992c9dcba7d554c24": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c7e6d23380e74b538688c7a646a75671", "placeholder": "​", "style": "IPY_MODEL_1af00018c9cb400581dffb977e9684a3", "value": "special_tokens_map.json: 100%" } }, "2f022591fff740ac89c771ea15a5fbe9": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_337f4bdd666246fbb43ecbef1c246a91", "max": 280, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a72527c43c2b4209a7627c3e3f4e9e35", "value": 280 } }, "639214f516034b3ebec50ed3035b5106": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_f814ff16b4f3444c9253e31cd1b3e386", "placeholder": "​", "style": "IPY_MODEL_9474a4f7cf72453f80332ebff571c777", "value": " 280/280 [00:00<00:00, 28.1kB/s]" } }, "84ef02b32cab488d9501e90626be1452": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c7e6d23380e74b538688c7a646a75671": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, 
"border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1af00018c9cb400581dffb977e9684a3": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "337f4bdd666246fbb43ecbef1c246a91": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a72527c43c2b4209a7627c3e3f4e9e35": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "f814ff16b4f3444c9253e31cd1b3e386": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9474a4f7cf72453f80332ebff571c777": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", 
"model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "notebook_login()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 303, "referenced_widgets": [ "cf15308d3f2e49ada30be466b73a7ad8", "c0c2bef72b5b424ea8c319352fcd9afd", "72bc6f5bdaaa4e2c8df022c6dd5f8299", "0f2befac587a473ea0fe439cc44d8782", "783c73eb74c44d3b838f7e13c938941c", "caf01ed869cb4262830b43a10e51fb89", "3123687fe2dc4f869d745fa5fbcf93a9", "1dc7dd582bae42b9b31dec146877e7cb", "b981faab214d44198e2cdb669b2f29b0", "e490ced073e246a5b91c7bf90c688f08", "a481f7f3e5654428a8e5cb47114907fe", "59fee8047d4e4aea92a0aff165d5b938", "b895ba2fd6fb43c5a3d0acd1ff21c924", "01080cf4aff9481f8475a4dfc508f23a", "81cf76e4280c41868988982ef32fc62b", "58699dd790a543f9a5f276f6149b8298", "775d5b3e765a47408b0c02b0827ee2a2" ] }, "id": "yrM6ZzXldMLo", "outputId": "613eeb51-ff37-4728-c212-378a7d81dedb" }, "execution_count": 3, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_train['id'] = 'POR' + df_train.index.astype(str) ############################################################################################################################################################################################################\n", ":16: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_dev['id'] = 'POR' + df_dev.index.astype(str) ############################################################################################################################################################################################################\n", ":18: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_test['id'] = 'POR' + df_test.index.astype(str) ############################################################################################################################################################################################################\n" ] } ], "source": [
"# Build the Portuguese train and test JSONL files from the mMGTD corpus CSV.\n",
"# NOTE(review): lines ending in a long run of '#' appear to mark the language-specific spots to edit when switching language.\n",
"import pandas as pd\n",
"splits = {'Arabic': 'Data-v3.1/ARA-v3-1.csv', 'Chinese': 'Data-v3.1/ZHO-v3-1.csv', 'Czech': 'Data-v3.1/CES-v3-1.csv', 'Dutch': 'Data-v3.1/NLD-v3-1.csv', 'English': 'Data-v3.1/ENG-v3-1.csv', 'French': 'Data-v3.1/FRA-v3-1.csv', 'German': 'Data-v3.1/DEU-v3-1.csv', 'Greek': 'Data-v3.1/ELL-v3-1.csv', 'Hebrew': 'Data-v3.1/HEB-v3-1.csv', 'Hindi': 'Data-v3.1/HIN-v3-1.csv', 'Indonesian': 'Data-v3.1/IND-v3-1.csv', 'Italian': 'Data-v3.1/ITA-v3-1.csv', 'Japanese': 'Data-v3.1/JPN-v3-1.csv', 'Korean': 'Data-v3.1/KOR-v3-1.csv', 'Persian': 'Data-v3.1/PES-v3-1.csv', 'Polish': 'Data-v3.1/POL-v3-1.csv', 'Portuguese': 'Data-v3.1/POR-v3-1.csv', 'Romanian': 'Data-v3.1/RON-v3-1.csv', 'Russian': 'Data-v3.1/RUS-v3-1.csv', 'Spanish': 'Data-v3.1/SPA-v3-1.csv', 'Turkish': 'Data-v3.1/TUR-v3-1.csv', 'Vietnamese': 'Data-v3.1/VIE-v3-1.csv', 'Ukrainian': 'Data-v3.1/UKR-v3-1.csv'}\n",
"df = pd.read_csv(\"hf://datasets/1024m/mMGTD-Corpus/\" + splits[\"Portuguese\"]) ############################################################################################################################################################################################################\n",
"# Shuffle the rows. The seed is fixed so the row order, and the ids built from the index below, are the same on every run.\n",
"df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
"# Keep only the three listed 'Type' values, then split on the 'Data Split' column.\n",
"# .copy() makes each split an independent DataFrame, so adding the 'id' column below no longer triggers SettingWithCopyWarning.\n",
"kept_types = df['Type'].isin(['Partial', 'Rewritten', 'Unchanged'])\n",
"df_train = df[kept_types & (df['Data Split'] == 'Train')].copy()\n",
"df_dev = df[kept_types & (df['Data Split'] == 'Dev')].copy()\n",
"df_test = df[kept_types & (df['Data Split'] == 'Test')].copy()\n",
"print(len(df_train))\n",
"print(len(df_dev))\n",
"print(len(df_test))\n",
"# Snapshots of each split with the original column names, taken before 'id' is added and the columns are renamed.\n",
"POR_train = df_train.copy() ############################################################################################################################################################################################################\n",
"POR_dev = df_dev.copy() ############################################################################################################################################################################################################\n",
"POR_test = df_test.copy() ############################################################################################################################################################################################################\n",
"# Give every row a language-prefixed id built from its index in the shuffled frame, then rename the text and label columns.\n",
"df_train['id'] = 'POR' + df_train.index.astype(str) ############################################################################################################################################################################################################\n",
"df_train = df_train.rename(columns={'Modified text': 'text', 'Split Location': 'label'})\n",
"df_dev['id'] = 'POR' + df_dev.index.astype(str) ############################################################################################################################################################################################################\n",
"df_dev = df_dev.rename(columns={'Modified text': 'text', 'Split Location': 'label'})\n",
"df_test['id'] = 'POR' + df_test.index.astype(str) ############################################################################################################################################################################################################\n",
"df_test = df_test.rename(columns={'Modified text': 'text', 'Split Location': 'label'})\n",
"# Append the dev rows to the training set; df_dev itself is left unchanged.\n",
"df_train = pd.concat([df_train, df_dev], ignore_index=True)\n",
"print(len(df_train))\n",
"print(len(df_dev))\n",
"print(len(df_test))\n",
"# Write one JSON record per line.\n",
"df_train.to_json('POR_train.jsonl', orient='records', lines=True)############################################################################################################################################################################################################\n",
"df_test.to_json('POR_test.jsonl', orient='records', lines=True)############################################################################################################################################################################################################"
] }, { "cell_type": "code", "source": [ "!pip 
install torch\n", "!pip install transformers\n", "!pip install accelerate -U\n", "!pip install tqdm\n", "!pip install pytorch-crf\n", "!pip install sentencepiece" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "C6wCkGRXqQpc", "outputId": "7f87598c-3dfb-4feb-f98f-c4c1a835af83", "collapsed": true }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.5.1+cu124)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.17.0)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.5)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.10.0)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)\n", " Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)\n", " Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)\n", " Downloading 
nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-curand-cu12==10.3.5.147 (from torch)\n", " Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)\n", " Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)\n", " Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n", "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)\n", " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.0)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n", "Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m16.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m18.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading 
nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: nvidia-nvjitlink-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12\n", " Attempting uninstall: nvidia-nvjitlink-cu12\n", " Found existing installation: nvidia-nvjitlink-cu12 12.5.82\n", " Uninstalling nvidia-nvjitlink-cu12-12.5.82:\n", " Successfully uninstalled nvidia-nvjitlink-cu12-12.5.82\n", " Attempting uninstall: nvidia-curand-cu12\n", " Found existing installation: nvidia-curand-cu12 10.3.6.82\n", " Uninstalling nvidia-curand-cu12-10.3.6.82:\n", " Successfully uninstalled nvidia-curand-cu12-10.3.6.82\n", " Attempting uninstall: nvidia-cufft-cu12\n", " Found existing installation: nvidia-cufft-cu12 11.2.3.61\n", " Uninstalling nvidia-cufft-cu12-11.2.3.61:\n", " Successfully uninstalled nvidia-cufft-cu12-11.2.3.61\n", " Attempting uninstall: nvidia-cuda-runtime-cu12\n", " Found existing installation: nvidia-cuda-runtime-cu12 12.5.82\n", " Uninstalling nvidia-cuda-runtime-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-runtime-cu12-12.5.82\n", " Attempting uninstall: nvidia-cuda-nvrtc-cu12\n", " Found existing installation: nvidia-cuda-nvrtc-cu12 12.5.82\n", " Uninstalling nvidia-cuda-nvrtc-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.5.82\n", " Attempting uninstall: nvidia-cuda-cupti-cu12\n", " Found existing installation: nvidia-cuda-cupti-cu12 12.5.82\n", " Uninstalling nvidia-cuda-cupti-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-cupti-cu12-12.5.82\n", " Attempting uninstall: nvidia-cublas-cu12\n", " Found existing installation: nvidia-cublas-cu12 12.5.3.2\n", " Uninstalling 
nvidia-cublas-cu12-12.5.3.2:\n", " Successfully uninstalled nvidia-cublas-cu12-12.5.3.2\n", " Attempting uninstall: nvidia-cusparse-cu12\n", " Found existing installation: nvidia-cusparse-cu12 12.5.1.3\n", " Uninstalling nvidia-cusparse-cu12-12.5.1.3:\n", " Successfully uninstalled nvidia-cusparse-cu12-12.5.1.3\n", " Attempting uninstall: nvidia-cudnn-cu12\n", " Found existing installation: nvidia-cudnn-cu12 9.3.0.75\n", " Uninstalling nvidia-cudnn-cu12-9.3.0.75:\n", " Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75\n", " Attempting uninstall: nvidia-cusolver-cu12\n", " Found existing installation: nvidia-cusolver-cu12 11.6.3.83\n", " Uninstalling nvidia-cusolver-cu12-11.6.3.83:\n", " Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83\n", "Successfully installed nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.48.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.17.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.28.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (24.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in 
/usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.0)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (2024.10.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2025.1.31)\n", "Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (1.3.0)\n", "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.11/dist-packages (from accelerate) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (24.2)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from accelerate) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from accelerate) (6.0.2)\n", "Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) 
(2.5.1+cu124)\n", "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.28.1)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.5.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (3.17.0)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2024.10.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.1.5)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (9.1.0.70)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from 
torch>=2.0.0->accelerate) (12.4.5.8)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (11.2.1.3)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (10.3.5.147)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (11.6.1.9)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.3.1.170)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (2.21.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.1.0)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch>=2.0.0->accelerate) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch>=2.0.0->accelerate) (3.0.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.10)\n", "Requirement 
# %% Output directories
import os

# Directories this experiment writes into.
os.makedirs("./runs/exp_seed", exist_ok=True)
os.makedirs("./runs/exp_seed/logs", exist_ok=True)
os.makedirs("./runs/exp_seed/xlmlongformerbase", exist_ok=True)

# %% Model, dataset and training definitions
import glob
import json
import logging
from dataclasses import dataclass, field

import numpy as np
import torch
import transformers
from torch import nn
from torch.cuda.amp import autocast
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForTokenClassification,
    AutoTokenizer,
    Trainer,
)
from transformers.trainer_callback import TrainerState

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()


@dataclass
class ModelConfig:
    """Pretrained encoder identifier and the directory checkpoints are saved to."""

    model_path: str = "hyperonym/xlm-roberta-longformer-base-16384"
    model_checkpoint_dir: str = "./runs"


@dataclass
class DatasetConfig:
    """Locations of the JSONL data files. EDIT these paths for your environment."""

    train_file: str = "/content/POR_train.jsonl"
    # default_factory so instances never share (and mutate) one list object.
    test_files: list = field(default_factory=lambda: ["/content/POR_test.jsonl"])


@dataclass
class TrainingArgsConfig:
    """Run-level knobs read by the training and prediction cells.

    NOTE(review): `do_predict`, `logging_steps` and `logging_dir` are not read
    anywhere in the visible cells - confirm whether later cells use them.
    """

    do_train: bool = False
    do_predict: bool = False
    seed: int = 1024
    output_dir: str = "./runs/exp_seed"
    logging_steps: int = 160
    logging_dir: str = "./runs/exp_seed"
    num_train_epochs: int = 30
    per_device_train_batch_size: int = 12
    per_device_eval_batch_size: int = 12
    max_length: int = 2048


model_args = ModelConfig()
data_args = DatasetConfig()
training_args = TrainingArgsConfig()


class CRFTrainer(Trainer):
    """`Trainer` whose loss is the CRF loss computed by `AutoModelCRF.forward`.

    The stock `Trainer.training_step` is inherited on purpose: it calls
    `compute_loss`, runs backward and returns the detached loss tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the CRF loss for one batch.

        Args:
            model: an `AutoModelCRF` (or a wrapper forwarding to one).
            inputs: batch dict holding "input_ids", "attention_mask", "labels".
                "labels" is popped from the dict.
            return_outputs: when True, return `(loss, outputs)` where
                `outputs` is a dict carrying the loss.
            num_items_in_batch: accepted for compatibility with newer
                `Trainer` versions that pass it; unused.
        """
        labels = inputs.pop("labels")
        # Passing labels makes the model return the loss directly; without
        # them forward() returns decoded tags, which cannot be used as emissions.
        crf_loss = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=labels,
        )
        return (crf_loss, {"loss": crf_loss}) if return_outputs else crf_loss


class AutoModelCRF(nn.Module):
    """Pretrained encoder -> dropout -> linear emissions -> CRF tagging head."""

    def __init__(self, model_name_or_path, dropout=0.075, num_labels=2, from_tf=True):
        """Build the encoder and the CRF head.

        Args:
            model_name_or_path: identifier/path passed to `from_pretrained`.
            dropout: dropout probability applied to encoder outputs.
            num_labels: number of tags (default 2, the original hard-coded value).
            from_tf: forwarded to `AutoModel.from_pretrained` (default True,
                as in the original call).
        """
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.num_labels = num_labels
        # NOTE(review): trust_remote_code=True executes code shipped with the
        # model repository - only use with sources you trust.
        self.encoder = AutoModel.from_pretrained(
            model_name_or_path,
            trust_remote_code=True,
            config=self.config,
            from_tf=from_tf,
        )
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.config.hidden_size, self.num_labels)
        self.crf = CRF(self.num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when `labels` is given, decoded tags otherwise.

        Returns:
            With labels: the negated value of `self.crf(...)`.
            Without labels: `np.ndarray` of decoded tags, one row per sequence,
            each row padded to the input width by repeating its last tag.
        """
        seq_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)[0]
        emission = self.linear(self.dropout(seq_output))
        mask = attention_mask.byte()

        if labels is not None:
            return -self.crf(emission, labels, mask=mask)

        tags = self.crf.decode(emission, mask)
        width = attention_mask.shape[1]
        # decode() yields one tag per unmasked step; extend each row to the
        # full width with its final tag so the rows stack into a rectangle.
        tags_padded = [seq + [seq[-1]] * (width - len(seq)) for seq in tags]
        return np.array(tags_padded)


def evaluate_position_difference(actual_position, predicted_position):
    """Absolute distance between the true and predicted start positions."""
    return abs(actual_position - predicted_position)


def get_start_position(sequence, mapping=None, token_level=True):
    """Locate the first 0 -> 1 transition in a tag sequence.

    Args:
        sequence: per-token tags (array-like of ints).
        mapping: optional per-token word indices; positions equal to -100 are
            dropped from both `sequence` and `mapping` before searching.
        token_level: when False and `mapping` is given, the token index is
            translated to its word index through `mapping`.

    Returns:
        Index just after the first step whose difference is exactly 1. With no
        such step: 0 if the sequence starts with 1, otherwise the last index.
        An empty sequence (e.g. everything masked out) yields 0.
    """
    sequence = np.asarray(sequence)
    if mapping is not None:
        mapping = np.asarray(mapping)
        keep = mapping != -100
        sequence = sequence[keep]
        mapping = mapping[keep]

    if sequence.size == 0:
        return 0

    # Signed ints so unsigned/bool inputs cannot wrap around inside diff().
    rises = np.flatnonzero(np.diff(sequence.astype(np.int64)) == 1)
    if rises.size > 0:
        value = rises[0] + 1
    else:
        value = 0 if sequence[0] == 1 else len(sequence) - 1

    if not token_level and mapping is not None:
        value = mapping[value]
    return value


def evaluate_machine_start_position(labels, predictions, idx2word=None, token_level=False):
    """Mean absolute error between true and predicted start positions.

    Args:
        labels: 2-D array of gold tags, one row per sample.
        predictions: 2-D array of predicted tags aligned with `labels`.
        idx2word: per-sample token -> word index maps; required when
            `token_level` is False.
        token_level: compare token indices (True) or word indices (False).

    Raises:
        ValueError: word-level evaluation requested without `idx2word`.
    """
    if not token_level and idx2word is None:
        raise ValueError("idx2word must be provided if evaluation is at word level (token_level=False)")

    actual_starts = []
    predicted_starts = []
    for idx in range(labels.shape[0]):
        length = len(labels[idx])
        # Position 0 (the slot the dataset fills with the leading special
        # token) is excluded from every sequence.
        mapping = None if token_level else idx2word[idx][1:length]
        predicted_starts.append(get_start_position(predictions[idx][1:length], mapping, token_level))
        actual_starts.append(get_start_position(labels[idx][1:length], mapping, token_level))

    position_differences = [
        evaluate_position_difference(actual, predicted)
        for actual, predicted in zip(actual_starts, predicted_starts)
    ]
    return np.mean(position_differences)


def compute_metrics(p):
    """Token-level metric callback: unpacks `(predictions, labels)` from `p`."""
    pred, labels = p
    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)
    return {"mean_absolute_diff": mean_absolute_diff}


def training_loop(model, optimizer, train_dataloader, device):
    """Run one epoch of optimisation and return the mean batch loss.

    Logs the loss of every step and prints the epoch average. Returns 0.0
    when the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        loss = model(input_ids, attention_mask, labels=labels)
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        logger.info(f"Step {step}: {step_loss:.4f}")
        total_loss += step_loss
        num_batches += 1

    avg_loss = total_loss / num_batches if num_batches else 0.0
    print(f"Training loss: {avg_loss:.4f}")
    return avg_loss


def predict(model, test_dataloader, device):
    """Decode tags for every batch and return them as one 2-D array.

    The model is switched to eval mode first so dropout cannot perturb the
    predictions when the caller forgets to do so.
    """
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            all_preds.extend(model(input_ids, attention_mask))
    out = np.array(all_preds)
    logger.info(f"Prediction array shape: {out.shape}")
    return out


def save_model(model_name, model, optimizer, epoch, output_dir):
    """Save model and optimizer state dicts to `<output_dir>/<name>-epoch-<epoch>.pt`.

    Slashes in `model_name` are replaced with dashes so a hub identifier maps
    to a single file name.
    """
    os.makedirs(output_dir, exist_ok=True)
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    safe_name = model_name.replace("/", "-")
    file_path = os.path.join(output_dir, f"{safe_name}-epoch-{epoch}.pt")
    torch.save(checkpoint, file_path)
    logger.info(f"Model has been saved successfully to {file_path}")


class Semeval_Data(torch.utils.data.Dataset):
    """JSONL dataset producing fixed-length token/tag tensors.

    Each line is a JSON object with "text" and "id", plus optionally "label",
    the index of the first whitespace-separated word tagged 1.
    """

    # Hard-coded special ids from the original code - presumably the
    # <s> / </s> / <pad> ids of the XLM-R vocabulary; confirm before
    # swapping in a different tokenizer.
    BOS_ID = 0
    EOS_ID = 2
    PAD_ID = 1
    # Word-map value for positions that belong to no word (specials/padding).
    NO_WORD = -100

    def __init__(self, data_path, model_name, max_length=512, inference=False, debug=False):
        """Load every record of `data_path` and the tokenizer for `model_name`.

        Args:
            data_path: JSONL file, one record per line (blank lines ignored).
            model_name: identifier passed to `AutoTokenizer.from_pretrained`.
            max_length: exact length of every returned sequence.
            inference: also return "text", "id" and "corresponding_word".
            debug: with `inference` False, also return the text before the
                labelled boundary.
        """
        with open(data_path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f if line.strip()]
        self.inference = inference
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length
        self.debug = debug

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record `idx` word by word and pad/truncate to `max_length`.

        Returns a dict with "input_ids" and "attention_mask" tensors, "labels"
        when the record has a label, and the extra fields described in
        `__init__` for inference/debug mode.
        """
        record = self.data[idx]
        text = record["text"]
        sample_id = record["id"]
        labels_available = "label" in record
        label = record["label"] if labels_available else None

        labels = []
        corresponding_word = []
        input_ids = []
        for jdx, word in enumerate(text.split(" ")):
            word_encoded = self.tokenizer.tokenize(word)
            sub_words = len(word_encoded)
            if labels_available:
                # Every sub-token inherits the tag of the word it came from.
                labels.extend([1 if jdx >= label else 0] * sub_words)
            corresponding_word.extend([jdx] * sub_words)
            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))

        # Two slots are reserved for the leading and trailing special tokens.
        budget = self.max_length - 2
        input_ids = input_ids[:budget]
        corresponding_word = corresponding_word[:budget]
        labels = labels[:budget]
        n_pad = budget - len(input_ids)

        attention_mask = [1] * (len(input_ids) + 2) + [0] * n_pad
        input_ids = [self.BOS_ID] + input_ids + [self.EOS_ID] + [self.PAD_ID] * n_pad
        corresponding_word = [self.NO_WORD] + corresponding_word + [self.NO_WORD] * (n_pad + 1)
        if labels_available:
            # The trailing special token and all padding repeat the last real
            # tag; 0 is used when the text produced no tokens at all.
            tail_label = labels[-1] if labels else 0
            labels = [0] + labels + [tail_label] * (n_pad + 1)

        encoded = {}
        if labels_available:
            encoded["labels"] = torch.tensor(labels)
        encoded["input_ids"] = torch.tensor(input_ids)
        encoded["attention_mask"] = torch.tensor(attention_mask)
        if labels_available:
            assert encoded["input_ids"].shape == encoded["labels"].shape

        if self.debug and not self.inference:
            encoded["partial_human_review"] = " ".join(text.split(" ")[:label])
        if self.inference:
            encoded["text"] = text
            encoded["id"] = sample_id
            encoded["corresponding_word"] = corresponding_word
        return encoded


if __name__ == "__main__":
    transformers.set_seed(training_args.seed)
    model_path = model_args.model_path
    model_checkpoint_dir = model_args.model_checkpoint_dir
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # NOTE(review): the model, optimizer and training set are built even when
    # do_train is False; kept because later cells may rely on these names.
    model = AutoModelCRF(model_path).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
    train_set = Semeval_Data(data_args.train_file, model_path, max_length=training_args.max_length)
    train_dataloader = DataLoader(
        train_set, batch_size=training_args.per_device_train_batch_size, shuffle=True
    )
    train_eval_dataloader = DataLoader(
        train_set, batch_size=training_args.per_device_eval_batch_size, shuffle=False
    )

    if training_args.do_train:
        logger.info("Training...")
        logger.info("*** Train Dataset ***")
        logger.info(f"Number of samples: {len(train_set)}")
        num_train_epochs = training_args.num_train_epochs
        for epoch in tqdm(range(num_train_epochs)):
            training_loop(model, optimizer, train_dataloader, device)
            # One checkpoint per epoch (the file name embeds `epoch`).
            save_model(model_path, model, optimizer, epoch, model_checkpoint_dir)
# %% Predict the machine-text start word for every test file
import os

import pandas as pd  # imported here: this cell builds a DataFrame before any other cell imports pandas

model = AutoModelCRF(model_args.model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_args.model_path)

# EDIT: path of the fine-tuned checkpoint (a dict with "model_state_dict").
# map_location lets a GPU-saved checkpoint load on whatever `device` is;
# weights_only=True refuses arbitrary pickled objects in a downloaded file.
checkpoint = torch.load("POR-xlm-longformer", map_location=device, weights_only=True)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()

test_sets = []
for test_file in data_args.test_files:
    test_set = Semeval_Data(
        test_file,
        model_args.model_path,
        max_length=training_args.max_length,
        inference=True,
    )
    test_dataloader = DataLoader(
        test_set, batch_size=training_args.per_device_eval_batch_size, shuffle=False
    )
    test_sets.append(test_dataloader)

logger.info("Predicting...")
logger.info("*** Test Datasets ***")
logger.info(f"Number of sets: {len(test_sets)}")

for idx, test_loader in enumerate(test_sets):
    logger.info(f"Test Dataset {idx + 1}")
    logger.info(f"Number of samples: {len(test_loader.dataset)}")

    # Single pass over the loader: the dataset tokenizes on every access, so
    # predictions, ids and word maps are collected together.
    predictions = []
    corresponding_words = []
    ids = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            predictions.extend(model(input_ids, attention_mask))
            ids.extend(batch["id"])
            # The default collate turns each sample's word-index list into a
            # list of per-position tensors; stacking on dim=1 restores one
            # row per sample.
            corresponding_words.extend(torch.stack(batch["corresponding_word"], dim=1).numpy())
    predictions = np.array(predictions)
    corresponding_words = np.array(corresponding_words)
    logger.info("Predictions completed!")

    df = pd.DataFrame(
        {
            "id": ids,
            "label": [
                get_start_position(pred, corr_word, token_level=False)
                for pred, corr_word in zip(predictions, corresponding_words)
            ],
        }
    )

    # Name the output after the file this loader was built from, not after
    # whichever file the loop above happened to finish on.
    file_name = os.path.basename(data_args.test_files[idx])
    file_dirs = os.path.join(training_args.output_dir, "predictions")
    os.makedirs(file_dirs, exist_ok=True)
    file_path = os.path.join(file_dirs, file_name)

    records = df.to_dict("records")
    with open(file_path, "w") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")
default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", " checkpoint = torch.load('POR-xlm-longformer')############################################################################################################################################################################################################\n", " 0%| | 0/4991 [00:00:22: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " corr_word_tensors = [torch.tensor(cw) for cw in batch['corresponding_word']]\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install jsonlines\n", "import pandas as pd\n", "import jsonlines\n", "jsonl_file_path = '/content/runs/exp_seed/predictions/POR_test.jsonl'############################################################################################################################################################################################################\n", "def display_jsonl_as_dataframe(file_path):\n", " data = []\n", " with jsonlines.open(file_path) as reader:\n", " for obj in reader:\n", " data.append(obj)\n", " df = pd.DataFrame(data)\n", " return df\n", "jsonl_df = 
display_jsonl_as_dataframe(jsonl_file_path)\n", "jsonl_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 527 }, "id": "yutpCG-Drcjn", "outputId": "0824122f-728f-48e0-dab3-e67490641ab4" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting jsonlines\n", " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.11/dist-packages (from jsonlines) (25.1.0)\n", "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", "Installing collected packages: jsonlines\n", "Successfully installed jsonlines-4.0.0\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " id label\n", "0 POR0 69\n", "1 POR4 17\n", "2 POR6 118\n", "3 POR7 34\n", "4 POR8 26\n", "... ... ...\n", "59883 POR119771 77\n", "59884 POR119774 17\n", "59885 POR119779 161\n", "59886 POR119782 41\n", "59887 POR119783 26\n", "\n", "[59888 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel
0POR069
1POR417
2POR6118
3POR734
4POR826
.........
59883POR11977177
59884POR11977417
59885POR119779161
59886POR11978241
59887POR11978326
\n", "

59888 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "jsonl_df", "summary": "{\n \"name\": \"jsonl_df\",\n \"rows\": 59888,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59888,\n \"samples\": [\n \"POR14210\",\n \"POR50610\",\n \"POR17108\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 205,\n \"min\": 0,\n \"max\": 1638,\n \"num_unique_values\": 1404,\n \"samples\": [\n 822,\n 217,\n 1268\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 10 } ] }, { "cell_type": "code", "source": [ "jsonl_file_path = '/content/POR_test.jsonl'############################################################################################################################################################################################################\n", "def display_jsonl_as_dataframe(file_path):\n", " data = []\n", " with jsonlines.open(file_path) as reader:\n", " for obj in reader:\n", " data.append(obj)\n", " df = pd.DataFrame(data)\n", " return df\n", "jsonl_df_gold = display_jsonl_as_dataframe(jsonl_file_path)\n", "jsonl_df_gold" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "nLm2KGliriEN", "outputId": "8702fe21-c131-4041-8f7a-2a33678864f7" }, "execution_count": 11, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 Partial Test \n", "... ... ... ... ... 
\n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp... 48 \n", "\n", " Original Char Count label \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " text New Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 \n", "2 A decisão foi anunciada depois que o julgament... 192 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 79 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 \n", "59887 Existem indicações de que os candidatos da Esp... 95 \n", "\n", " New Char Count id \n", "0 436 POR0 \n", "1 1026 POR4 \n", "2 1229 POR6 \n", "3 723 POR7 \n", "4 751 POR8 \n", "... ... ... 
\n", "59883 1295 POR119771 \n", "59884 512 POR119774 \n", "59885 1459 POR119779 \n", "59886 722 POR119782 \n", "59887 564 POR119783 \n", "\n", "[59888 rows x 12 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountlabeltextNew Word CountNew Char Countid
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70436POR0
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...1431026POR4
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...1921229POR6
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118723POR7
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130751POR8
.......................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...2021295POR119771
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...79512POR119774
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...2421459POR119779
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...125722POR119782
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...95564POR119783
\n", "

59888 rows × 12 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "jsonl_df_gold", "summary": "{\n \"name\": \"jsonl_df_gold\",\n \"rows\": 59888,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POR\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Unchanged\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59881,\n \"samples\": [\n \"De acordo com a Superintend\\u00eancia da Infraero em Manaus, o bimotor Seneca, modelo PA 34, iria para Santar\\u00e9m (PA) e caiu antes de atingir a altitude de cruzeiro. As condi\\u00e7\\u00f5es meteorol\\u00f3gicas, no momento do acidente, eram boas. 
O aeroporto est\\u00e1 operando normalmente.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 410,\n \"min\": 5,\n \"max\": 12061,\n \"num_unique_values\": 2346,\n \"samples\": [\n 1583\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2501,\n \"min\": 26,\n \"max\": 75639,\n \"num_unique_values\": 7856,\n \"samples\": [\n 6437\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 249,\n \"min\": 1,\n \"max\": 8516,\n \"num_unique_values\": 1669,\n \"samples\": [\n 488\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1509,\n \"min\": 6,\n \"max\": 51592,\n \"num_unique_values\": 5972,\n \"samples\": [\n 3425\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59888,\n \"samples\": [\n \"POR14210\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 11 } ] }, { "cell_type": "code", "source": [ "jsonl_df = jsonl_df.rename(columns={'label': 'label_pred'})\n", "jsonl_df_gold = jsonl_df_gold.rename(columns={'label': 'label_gold'})\n", "merged_df = pd.merge(jsonl_df[['id', 'label_pred']], jsonl_df_gold[['id','text','label_gold']], on='id')\n", "merged_df['diff'] = (merged_df['label_pred'] - merged_df['label_gold']).abs()\n", "merged_df['id'] = merged_df['id'].str[3:].astype(int)\n", "merged_df = POR_test.merge(merged_df, left_index=True, right_on='id', how='outer')############################################################################################################################################################################################################\n", "merged_df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 614 }, "id": "wFmwSZsirsFY", "outputId": "b1048e1b-97f6-40ab-fc0b-015ddc09e274" }, "execution_count": 12, "outputs": [ { "output_type": 
"execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 Partial Test \n", "... ... ... ... ... \n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp... 48 \n", "\n", " Original Char Count Split Location \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " Modified text New Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 \n", "2 A decisão foi anunciada depois que o julgament... 192 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 
79 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 \n", "59887 Existem indicações de que os candidatos da Esp... 95 \n", "\n", " New Char Count id label_pred \\\n", "0 436 0 69 \n", "1 1026 4 17 \n", "2 1229 6 118 \n", "3 723 7 34 \n", "4 751 8 26 \n", "... ... ... ... \n", "59883 1295 119771 77 \n", "59884 512 119774 17 \n", "59885 1459 119779 161 \n", "59886 722 119782 41 \n", "59887 564 119783 26 \n", "\n", " text label_gold diff \n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 1 \n", "1 'Eu expliquei nossa posição de que o que é nec... 20 3 \n", "2 A decisão foi anunciada depois que o julgament... 118 0 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 36 2 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 26 0 \n", "... ... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 83 6 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 35 18 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 161 0 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 30 11 \n", "59887 Existem indicações de que os candidatos da Esp... 23 3 \n", "\n", "[59888 rows x 16 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiff
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70436069Se se tratasse de uma fábrica de sapatos ainda...701
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...1431026417'Eu expliquei nossa posição de que o que é nec...203
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...19212296118A decisão foi anunciada depois que o julgament...1180
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118723734O Bankia , criado em 2010 depois da fusão de s...362
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130751826O DVD ‘ Karaokê ’ , também de dezembro , já ve...260
...................................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...202129511977177Sherk e Fiona, Pocahontas, a Capuchinho Vermel...836
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...7951211977417'Foi muito construtivo para mim (o estágio de ...3518
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...2421459119779161de 2008 após a queda do 6º andar do edifício L...1610
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...12572211978241Segundo o levantamento, em São Paulo, o preço ...3011
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...9556411978326Existem indicações de que os candidatos da Esp...233
\n", "

59888 rows × 16 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 59888,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POR\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Unchanged\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59881,\n \"samples\": [\n \"De acordo com a Superintend\\u00eancia da Infraero em Manaus, o bimotor Seneca, modelo PA 34, iria para Santar\\u00e9m (PA) e caiu antes de atingir a altitude de cruzeiro. As condi\\u00e7\\u00f5es meteorol\\u00f3gicas, no momento do acidente, eram boas. 
O aeroporto est\\u00e1 operando normalmente.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 410,\n \"min\": 5,\n \"max\": 12061,\n \"num_unique_values\": 2346,\n \"samples\": [\n 1583\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2501,\n \"min\": 26,\n \"max\": 75639,\n \"num_unique_values\": 7856,\n \"samples\": [\n 6437\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 249,\n \"min\": 1,\n \"max\": 8516,\n \"num_unique_values\": 1669,\n \"samples\": [\n 488\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1509,\n \"min\": 6,\n \"max\": 51592,\n \"num_unique_values\": 5972,\n \"samples\": [\n 3425\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 34619,\n \"min\": 0,\n \"max\": 119783,\n \"num_unique_values\": 59888,\n \"samples\": [\n 14210\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 205,\n \"min\": 0,\n \"max\": 1638,\n \"num_unique_values\": 1404,\n \"samples\": [\n 822\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 86,\n \"min\": 0,\n \"max\": 7281,\n \"num_unique_values\": 535,\n \"samples\": [\n 70\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 12 } ] }, { "cell_type": "code", "source": [ "df = merged_df.copy()\n", "tokenizer = AutoTokenizer.from_pretrained(\"hyperonym/xlm-roberta-longformer-base-16384\") # USE SAME TOKENIZER AS USED IN TRAINING\n", "def check_split_position(row):\n", " text = row['Modified text']\n", " words = text.split()\n", " cumulative_tokens = 0\n", " for i in range(row['Split Location']): # Assuming Split Location is 1-based index\n", " tokens = tokenizer.tokenize(words[i])\n", " cumulative_tokens += len(tokens)\n", " if cumulative_tokens > 2048: # Check if we've already passed 2048 tokens\n", " return \"Outside\"\n", " return \"Inside\"\n", "df['Token Limit Check'] = df.apply(check_split_position, axis=1)\n", "df" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 614 }, "id": "n3EtjBRXr53j", "outputId": "4b3cbca0-7ee4-4643-c6b6-a996293736b5" }, "execution_count": 13, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 
Partial Test \n", "... ... ... ... ... \n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp... 48 \n", "\n", " Original Char Count Split Location \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " Modified text New Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 \n", "2 A decisão foi anunciada depois que o julgament... 192 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 79 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 \n", "59887 Existem indicações de que os candidatos da Esp... 
95 \n", "\n", " New Char Count id label_pred \\\n", "0 436 0 69 \n", "1 1026 4 17 \n", "2 1229 6 118 \n", "3 723 7 34 \n", "4 751 8 26 \n", "... ... ... ... \n", "59883 1295 119771 77 \n", "59884 512 119774 17 \n", "59885 1459 119779 161 \n", "59886 722 119782 41 \n", "59887 564 119783 26 \n", "\n", " text label_gold diff \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 1 \n", "1 'Eu expliquei nossa posição de que o que é nec... 20 3 \n", "2 A decisão foi anunciada depois que o julgament... 118 0 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 36 2 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 26 0 \n", "... ... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 83 6 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 35 18 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 161 0 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 30 11 \n", "59887 Existem indicações de que os candidatos da Esp... 23 3 \n", "\n", " Token Limit Check \n", "0 Inside \n", "1 Inside \n", "2 Inside \n", "3 Inside \n", "4 Inside \n", "... ... \n", "59883 Inside \n", "59884 Inside \n", "59885 Inside \n", "59886 Inside \n", "59887 Inside \n", "\n", "[59888 rows x 17 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiffToken Limit Check
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70436069Se se tratasse de uma fábrica de sapatos ainda...701Inside
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...1431026417'Eu expliquei nossa posição de que o que é nec...203Inside
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...19212296118A decisão foi anunciada depois que o julgament...1180Inside
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118723734O Bankia , criado em 2010 depois da fusão de s...362Inside
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130751826O DVD ‘ Karaokê ’ , também de dezembro , já ve...260Inside
......................................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...202129511977177Sherk e Fiona, Pocahontas, a Capuchinho Vermel...836Inside
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...7951211977417'Foi muito construtivo para mim (o estágio de ...3518Inside
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...2421459119779161de 2008 após a queda do 6º andar do edifício L...1610Inside
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...12572211978241Segundo o levantamento, em São Paulo, o preço ...3011Inside
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...9556411978326Existem indicações de que os candidatos da Esp...233Inside
\n", "

59888 rows × 17 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 59888,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POR\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Unchanged\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59881,\n \"samples\": [\n \"De acordo com a Superintend\\u00eancia da Infraero em Manaus, o bimotor Seneca, modelo PA 34, iria para Santar\\u00e9m (PA) e caiu antes de atingir a altitude de cruzeiro. As condi\\u00e7\\u00f5es meteorol\\u00f3gicas, no momento do acidente, eram boas. 
O aeroporto est\\u00e1 operando normalmente.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 410,\n \"min\": 5,\n \"max\": 12061,\n \"num_unique_values\": 2346,\n \"samples\": [\n 1583\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2501,\n \"min\": 26,\n \"max\": 75639,\n \"num_unique_values\": 7856,\n \"samples\": [\n 6437\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 249,\n \"min\": 1,\n \"max\": 8516,\n \"num_unique_values\": 1669,\n \"samples\": [\n 488\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1509,\n \"min\": 6,\n \"max\": 51592,\n \"num_unique_values\": 5972,\n \"samples\": [\n 3425\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 34619,\n \"min\": 0,\n \"max\": 119783,\n \"num_unique_values\": 59888,\n \"samples\": [\n 14210\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 205,\n \"min\": 0,\n \"max\": 1638,\n \"num_unique_values\": 1404,\n \"samples\": [\n 822\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 86,\n \"min\": 0,\n \"max\": 7281,\n \"num_unique_values\": 535,\n \"samples\": [\n 70\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Token Limit Check\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Outside\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 13 } ] }, { "cell_type": "code", "source": [ "df['Token Limit Check'].value_counts()" ], "metadata": { "id": "gWfUnO17r8zb", "colab": { "base_uri": "https://localhost:8080/", "height": 178 }, "outputId": "50f48aae-8b55-4bbb-ea4c-0ef8bcf34d98" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Token Limit Check\n", "Inside 59575\n", "Outside 313\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
Token Limit Check
Inside59575
Outside313
\n", "

" ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "code", "source": [ "df['Split Location'].max()" ], "metadata": { "id": "HdNmbX6yr_Lv", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "f9e44808-60e3-4206-df61-21ba24a87247" }, "execution_count": 15, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "8403" ] }, "metadata": {}, "execution_count": 15 } ] }, { "cell_type": "code", "source": [ "def create_word_series(row, column_name):\n", " word_count = row['New Word Count']\n", " split_location = row[column_name]\n", " series = [0] * split_location + [1] * (word_count - split_location)\n", " return series\n", "df['WORDS_REAL'] = df.apply(create_word_series, axis=1, args=('Split Location',))\n", "df['WORDS_PRED'] = df.apply(create_word_series, axis=1, args=('label_pred',))\n", "df" ], "metadata": { "id": "R6waU4p-sCcV", "colab": { "base_uri": "https://localhost:8080/", "height": 788 }, "outputId": "27160bed-a1eb-4554-d6ac-95344593885e" }, "execution_count": 16, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 Partial Test \n", "... ... ... ... ... \n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 
131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp... 48 \n", "\n", " Original Char Count Split Location \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " Modified text New Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 \n", "2 A decisão foi anunciada depois que o julgament... 192 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 79 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 \n", "59887 Existem indicações de que os candidatos da Esp... 95 \n", "\n", " New Char Count id label_pred \\\n", "0 436 0 69 \n", "1 1026 4 17 \n", "2 1229 6 118 \n", "3 723 7 34 \n", "4 751 8 26 \n", "... ... ... ... \n", "59883 1295 119771 77 \n", "59884 512 119774 17 \n", "59885 1459 119779 161 \n", "59886 722 119782 41 \n", "59887 564 119783 26 \n", "\n", " text label_gold diff \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 1 \n", "1 'Eu expliquei nossa posição de que o que é nec... 20 3 \n", "2 A decisão foi anunciada depois que o julgament... 118 0 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 36 2 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 26 0 \n", "... ... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 83 6 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 
35 18 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 161 0 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 30 11 \n", "59887 Existem indicações de que os candidatos da Esp... 23 3 \n", "\n", " Token Limit Check WORDS_REAL \\\n", "0 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... ... \n", "59883 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59884 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59885 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59886 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59887 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED \n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... \n", "59883 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59884 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59885 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59886 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59887 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", "[59888 rows x 19 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiffToken Limit CheckWORDS_REALWORDS_PRED
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70436069Se se tratasse de uma fábrica de sapatos ainda...701Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...1431026417'Eu expliquei nossa posição de que o que é nec...203Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...19212296118A decisão foi anunciada depois que o julgament...1180Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118723734O Bankia , criado em 2010 depois da fusão de s...362Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130751826O DVD ‘ Karaokê ’ , também de dezembro , já ve...260Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
............................................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...202129511977177Sherk e Fiona, Pocahontas, a Capuchinho Vermel...836Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...7951211977417'Foi muito construtivo para mim (o estágio de ...3518Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...2421459119779161de 2008 após a queda do 6º andar do edifício L...1610Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...12572211978241Segundo o levantamento, em São Paulo, o preço ...3011Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...9556411978326Existem indicações de que os candidatos da Esp...233Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
\n", "

59888 rows × 19 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 59888,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"POR\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"GPT-4o\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Unchanged\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59881,\n \"samples\": [\n \"De acordo com a Superintend\\u00eancia da Infraero em Manaus, o bimotor Seneca, modelo PA 34, iria para Santar\\u00e9m (PA) e caiu antes de atingir a altitude de cruzeiro. As condi\\u00e7\\u00f5es meteorol\\u00f3gicas, no momento do acidente, eram boas. 
O aeroporto est\\u00e1 operando normalmente.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 410,\n \"min\": 5,\n \"max\": 12061,\n \"num_unique_values\": 2346,\n \"samples\": [\n 1583\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2501,\n \"min\": 26,\n \"max\": 75639,\n \"num_unique_values\": 7856,\n \"samples\": [\n 6437\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 249,\n \"min\": 1,\n \"max\": 8516,\n \"num_unique_values\": 1669,\n \"samples\": [\n 488\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1509,\n \"min\": 6,\n \"max\": 51592,\n \"num_unique_values\": 5972,\n \"samples\": [\n 3425\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 34619,\n \"min\": 0,\n \"max\": 119783,\n \"num_unique_values\": 59888,\n \"samples\": [\n 14210\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 205,\n \"min\": 0,\n \"max\": 1638,\n \"num_unique_values\": 1404,\n \"samples\": [\n 822\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 59866,\n \"samples\": [\n \"- Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este Continue this text directly in Portuguese : - Queria dizer a todos os prefeitos, aos reeleitos e aos eleitos pela primeira vez, que aqui t\\u00eam sua casa. Considerem este **espa\\u00e7o como um ambiente de di\\u00e1logo permanente, aberto para o debate franco e a constru\\u00e7\\u00e3o de solu\\u00e7\\u00f5es conjuntas para os desafios que enfrentamos. 
Contem com a nossa parceria e o nosso apoio, independentemente de bandeiras partid\\u00e1rias, pois o nosso objetivo comum \\u00e9 o desenvolvimento e o bem-estar das nossas cidades e dos nossos cidad\\u00e3os.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 241,\n \"min\": 0,\n \"max\": 8403,\n \"num_unique_values\": 1563,\n \"samples\": [\n 2648\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 86,\n \"min\": 0,\n \"max\": 7281,\n \"num_unique_values\": 535,\n \"samples\": [\n 70\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Token Limit Check\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Outside\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORDS_REAL\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORDS_PRED\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 16 } ] }, { "cell_type": "code", "source": [ "def calculate_metrics(row):\n", " tp = 0\n", " fp = 0\n", " tn = 0\n", " fn = 0\n", " for i in range(len(row['WORDS_REAL'])):\n", " if row['WORDS_REAL'][i] == 1 and row['WORDS_PRED'][i] == 1:\n", " tp += 1\n", " elif row['WORDS_REAL'][i] == 0 and row['WORDS_PRED'][i] == 1:\n", " fp += 1\n", " elif row['WORDS_REAL'][i] == 0 and row['WORDS_PRED'][i] == 0:\n", " tn += 1\n", " elif row['WORDS_REAL'][i] == 1 and row['WORDS_PRED'][i] == 0:\n", " fn += 1\n", " return tp, fp, tn, fn\n", "df[['ROW_TP', 'ROW_FP', 'ROW_TN', 'ROW_FN']] = df.apply(calculate_metrics, axis=1, result_type='expand')\n", "df" ], "metadata": { "id": "WI83u4mjsEvy", "colab": { 
"base_uri": "https://localhost:8080/", "height": 805 }, "outputId": "14c5c2b9-1251-4d76-b157-cc7e96ce62a0" }, "execution_count": 17, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 Partial Test \n", "... ... ... ... ... \n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp... 48 \n", "\n", " Original Char Count Split Location \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " Modified text New Word Count ... \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 ... \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 ... \n", "2 A decisão foi anunciada depois que o julgament... 192 ... \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 ... \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 ... \n", "... ... ... 
... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 ... \n", "59884 'Foi muito construtivo para mim (o estágio de ... 79 ... \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 ... \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 ... \n", "59887 Existem indicações de que os candidatos da Esp... 95 ... \n", "\n", " text label_gold diff \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 1 \n", "1 'Eu expliquei nossa posição de que o que é nec... 20 3 \n", "2 A decisão foi anunciada depois que o julgament... 118 0 \n", "3 O Bankia , criado em 2010 depois da fusão de s... 36 2 \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 26 0 \n", "... ... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 83 6 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 35 18 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 161 0 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 30 11 \n", "59887 Existem indicações de que os candidatos da Esp... 23 3 \n", "\n", " Token Limit Check WORDS_REAL \\\n", "0 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... ... \n", "59883 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59884 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59885 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59886 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59887 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED ROW_TP ROW_FP ROW_TN \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0 1 69 \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
123 3 17 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 74 0 118 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 82 2 34 \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 104 0 26 \n", "... ... ... ... ... \n", "59883 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 119 6 77 \n", "59884 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 44 18 17 \n", "59885 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 81 0 161 \n", "59886 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 84 0 30 \n", "59887 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 69 0 23 \n", "\n", " ROW_FN \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "59883 0 \n", "59884 0 \n", "59885 0 \n", "59886 11 \n", "59887 3 \n", "\n", "[59888 rows x 23 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...textlabel_golddiffToken Limit CheckWORDS_REALWORDS_PREDROW_TPROW_FPROW_TNROW_FN
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70...Se se tratasse de uma fábrica de sapatos ainda...701Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...01690
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...143...'Eu expliquei nossa posição de que o que é nec...203Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...1233170
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...192...A decisão foi anunciada depois que o julgament...1180Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...7401180
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118...O Bankia , criado em 2010 depois da fusão de s...362Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...822340
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130...O DVD ‘ Karaokê ’ , também de dezembro , já ve...260Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...1040260
..................................................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...202...Sherk e Fiona, Pocahontas, a Capuchinho Vermel...836Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...1196770
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...79...'Foi muito construtivo para mim (o estágio de ...3518Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...4418170
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...242...de 2008 após a queda do 6º andar do edifício L...1610Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...8101610
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...125...Segundo o levantamento, em São Paulo, o preço ...3011Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...8403011
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...95...Existem indicações de que os candidatos da Esp...233Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...690233
\n", "

59888 rows × 23 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df" } }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [
  "def calculate_row_metrics(row):\n",
  "    \"\"\"Compute accuracy, precision, recall and F1 from one row's confusion counts.\n",
  "\n",
  "    Reads ``row['ROW_TP']``, ``row['ROW_FP']``, ``row['ROW_TN']`` and\n",
  "    ``row['ROW_FN']`` and returns ``(accuracy, precision, recall, f1_score)``.\n",
  "\n",
  "    Any ratio whose denominator is zero is reported as 0. In particular a row\n",
  "    with no true positives and no false negatives gets recall 0 (and hence\n",
  "    F1 0), which pulls down any average taken over such rows.\n",
  "    \"\"\"\n",
  "    def ratio_or_zero(numerator, denominator):\n",
  "        # One shared rule for all four metrics: undefined ratio -> 0.\n",
  "        return 0 if denominator == 0 else numerator / denominator\n",
  "\n",
  "    tp = row['ROW_TP']\n",
  "    fp = row['ROW_FP']\n",
  "    tn = row['ROW_TN']\n",
  "    fn = row['ROW_FN']\n",
  "\n",
  "    accuracy = ratio_or_zero(tp + tn, tp + tn + fp + fn)\n",
  "    precision = ratio_or_zero(tp, tp + fp)\n",
  "    recall = ratio_or_zero(tp, tp + fn)\n",
  "    # F1 is the harmonic mean of precision and recall.\n",
  "    f1_score = ratio_or_zero(2 * (precision * recall), precision + recall)\n",
  "    return accuracy, precision, recall, f1_score\n",
  "\n",
  "\n",
  "# result_type='expand' spreads each returned 4-tuple over the four new columns.\n",
  "df[['ROW_ACC', 'ROW_PREC', 'ROW_REC', 'ROW_F1']] = df.apply(calculate_row_metrics, axis=1, result_type='expand')\n",
  "df"
], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 666 }, "id": "6PnV_NwCsJNG", "outputId": "88092b45-06b7-4db4-eeb5-fa1c71f5dd4c" }, "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 Partial Test \n", "... ... ... ... ... \n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "...
... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp... 48 \n", "\n", " Original Char Count Split Location \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " Modified text New Word Count ... \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 ... \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 ... \n", "2 A decisão foi anunciada depois que o julgament... 192 ... \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 ... \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 ... \n", "... ... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 ... \n", "59884 'Foi muito construtivo para mim (o estágio de ... 79 ... \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 ... \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 ... \n", "59887 Existem indicações de que os candidatos da Esp... 95 ... \n", "\n", " WORDS_REAL \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "... ... \n", "59883 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59884 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59885 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59886 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "59887 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", "\n", " WORDS_PRED ROW_TP ROW_FP \\\n", "0 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0 1 \n", "1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 123 3 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 74 0 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 82 2 \n", "4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 104 0 \n", "... ... ... ... \n", "59883 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 119 6 \n", "59884 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 44 18 \n", "59885 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 81 0 \n", "59886 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 84 0 \n", "59887 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 69 0 \n", "\n", " ROW_TN ROW_FN ROW_ACC ROW_PREC ROW_REC ROW_F1 \n", "0 69 0 0.985714 0.000000 0.000000 0.000000 \n", "1 17 0 0.979021 0.976190 1.000000 0.987952 \n", "2 118 0 1.000000 1.000000 1.000000 1.000000 \n", "3 34 0 0.983051 0.976190 1.000000 0.987952 \n", "4 26 0 1.000000 1.000000 1.000000 1.000000 \n", "... ... ... ... ... ... ... \n", "59883 77 0 0.970297 0.952000 1.000000 0.975410 \n", "59884 17 0 0.772152 0.709677 1.000000 0.830189 \n", "59885 161 0 1.000000 1.000000 1.000000 1.000000 \n", "59886 30 11 0.912000 1.000000 0.884211 0.938547 \n", "59887 23 3 0.968421 1.000000 0.958333 0.978723 \n", "\n", "[59888 rows x 27 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...WORDS_REALWORDS_PREDROW_TPROW_FPROW_TNROW_FNROW_ACCROW_PRECROW_RECROW_F1
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...016900.9857140.0000000.0000000.000000
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...143...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...12331700.9790210.9761901.0000000.987952
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...192...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...74011801.0000001.0000001.0000001.000000
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...8223400.9830510.9761901.0000000.987952
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...10402601.0000001.0000001.0000001.000000
..................................................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...202...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...11967700.9702970.9520001.0000000.975410
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...79...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...44181700.7721520.7096771.0000000.830189
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...242...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...81016101.0000001.0000001.0000001.000000
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...125...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...84030110.9120001.0000000.8842110.938547
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...95...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...6902330.9684211.0000000.9583330.978723
\n", "

59888 rows × 27 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df" } }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [
  "def calculate_percentage_of_ones(row):\n",
  "    \"\"\"Return the mean of the entries of ``row['WORDS_PRED']``, or 0 when it is empty.\n",
  "\n",
  "    Despite the name, the value is a fraction (sum / length), not a 0-100\n",
  "    percentage.\n",
  "    NOTE(review): presumably WORDS_PRED holds per-word 0/1 predictions, so this\n",
  "    is the share of words predicted as 1 - confirm against the cell that builds it.\n",
  "    \"\"\"\n",
  "    predicted = row['WORDS_PRED']\n",
  "    n_words = len(predicted)\n",
  "    if n_words == 0:\n",
  "        return 0\n",
  "    return sum(predicted) / n_words\n",
  "\n",
  "\n",
  "df[\"Label : 1\"] = df.apply(calculate_percentage_of_ones, axis=1)\n",
  "# The two label shares are complementary by construction.\n",
  "df[\"Label : 0\"] = 1.0 - df[\"Label : 1\"]\n",
  "df"
], "metadata": { "id": "Yp3FO_HVsLiA", "colab": { "base_uri": "https://localhost:8080/", "height": 614 }, "outputId": "b89560ff-661b-4f1d-8a2b-dc43d302e887" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 POR PPLX-Sonar-Large Unchanged Test \n", "1 POR Amazon-Nova-Pro-1.0 Partial Test \n", "2 POR Claude-Sonnet-3.5 Partial Test \n", "3 POR GPT-o1 Partial Test \n", "4 POR GPT-o1 Partial Test \n", "... ... ... ... ... \n", "59883 POR PPLX-Sonar-Large Partial Test \n", "59884 POR GPT-4o Partial Test \n", "59885 POR Claude-Haiku-3.5 Partial Test \n", "59886 POR Aya-23 Partial Test \n", "59887 POR Aya-23 Partial Test \n", "\n", " Original text Original Word Count \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 \n", "1 'Eu expliquei nossa posição de que o que é nec... 57 \n", "2 A decisão foi anunciada depois que o julgament... 191 \n", "3 O Bankia, criado em 2010 depois da fusão de se... 76 \n", "4 O DVD ‘Karaokê’, também de dezembro, já vendeu... 63 \n", "... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 131 \n", "59884 'Foi muito construtivo para mim (o estágio de ... 56 \n", "59885 de 2008 após a queda do 6º andar do edifício L... 433 \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 61 \n", "59887 Existem indicações de que os candidatos da Esp...
48 \n", "\n", " Original Char Count Split Location \\\n", "0 436 70 \n", "1 366 20 \n", "2 1217 118 \n", "3 435 36 \n", "4 331 26 \n", "... ... ... \n", "59883 856 83 \n", "59884 334 35 \n", "59885 2645 161 \n", "59886 331 30 \n", "59887 284 23 \n", "\n", " Modified text New Word Count ... \\\n", "0 Se se tratasse de uma fábrica de sapatos ainda... 70 ... \n", "1 'Eu expliquei nossa posição de que o que é nec... 143 ... \n", "2 A decisão foi anunciada depois que o julgament... 192 ... \n", "3 O Bankia , criado em 2010 depois da fusão de s... 118 ... \n", "4 O DVD ‘ Karaokê ’ , também de dezembro , já ve... 130 ... \n", "... ... ... ... \n", "59883 Sherk e Fiona, Pocahontas, a Capuchinho Vermel... 202 ... \n", "59884 'Foi muito construtivo para mim (o estágio de ... 79 ... \n", "59885 de 2008 após a queda do 6º andar do edifício L... 242 ... \n", "59886 Segundo o levantamento, em São Paulo, o preço ... 125 ... \n", "59887 Existem indicações de que os candidatos da Esp... 95 ... \n", "\n", " ROW_TP ROW_FP ROW_TN ROW_FN ROW_ACC ROW_PREC ROW_REC ROW_F1 \\\n", "0 0 1 69 0 0.985714 0.000000 0.000000 0.000000 \n", "1 123 3 17 0 0.979021 0.976190 1.000000 0.987952 \n", "2 74 0 118 0 1.000000 1.000000 1.000000 1.000000 \n", "3 82 2 34 0 0.983051 0.976190 1.000000 0.987952 \n", "4 104 0 26 0 1.000000 1.000000 1.000000 1.000000 \n", "... ... ... ... ... ... ... ... ... \n", "59883 119 6 77 0 0.970297 0.952000 1.000000 0.975410 \n", "59884 44 18 17 0 0.772152 0.709677 1.000000 0.830189 \n", "59885 81 0 161 0 1.000000 1.000000 1.000000 1.000000 \n", "59886 84 0 30 11 0.912000 1.000000 0.884211 0.938547 \n", "59887 69 0 23 3 0.968421 1.000000 0.958333 0.978723 \n", "\n", " Label : 1 Label : 0 \n", "0 0.014286 0.985714 \n", "1 0.881119 0.118881 \n", "2 0.385417 0.614583 \n", "3 0.711864 0.288136 \n", "4 0.800000 0.200000 \n", "... ... ... 
\n", "59883 0.618812 0.381188 \n", "59884 0.784810 0.215190 \n", "59885 0.334711 0.665289 \n", "59886 0.672000 0.328000 \n", "59887 0.726316 0.273684 \n", "\n", "[59888 rows x 29 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...ROW_TPROW_FPROW_TNROW_FNROW_ACCROW_PRECROW_RECROW_F1Label : 1Label : 0
0PORPPLX-Sonar-LargeUnchangedTestSe se tratasse de uma fábrica de sapatos ainda...7043670Se se tratasse de uma fábrica de sapatos ainda...70...016900.9857140.0000000.0000000.0000000.0142860.985714
1PORAmazon-Nova-Pro-1.0PartialTest'Eu expliquei nossa posição de que o que é nec...5736620'Eu expliquei nossa posição de que o que é nec...143...12331700.9790210.9761901.0000000.9879520.8811190.118881
2PORClaude-Sonnet-3.5PartialTestA decisão foi anunciada depois que o julgament...1911217118A decisão foi anunciada depois que o julgament...192...74011801.0000001.0000001.0000001.0000000.3854170.614583
3PORGPT-o1PartialTestO Bankia, criado em 2010 depois da fusão de se...7643536O Bankia , criado em 2010 depois da fusão de s...118...8223400.9830510.9761901.0000000.9879520.7118640.288136
4PORGPT-o1PartialTestO DVD ‘Karaokê’, também de dezembro, já vendeu...6333126O DVD ‘ Karaokê ’ , também de dezembro , já ve...130...10402601.0000001.0000001.0000001.0000000.8000000.200000
..................................................................
59883PORPPLX-Sonar-LargePartialTestSherk e Fiona, Pocahontas, a Capuchinho Vermel...13185683Sherk e Fiona, Pocahontas, a Capuchinho Vermel...202...11967700.9702970.9520001.0000000.9754100.6188120.381188
59884PORGPT-4oPartialTest'Foi muito construtivo para mim (o estágio de ...5633435'Foi muito construtivo para mim (o estágio de ...79...44181700.7721520.7096771.0000000.8301890.7848100.215190
59885PORClaude-Haiku-3.5PartialTestde 2008 após a queda do 6º andar do edifício L...4332645161de 2008 após a queda do 6º andar do edifício L...242...81016101.0000001.0000001.0000001.0000000.3347110.665289
59886PORAya-23PartialTestSegundo o levantamento, em São Paulo, o preço ...6133130Segundo o levantamento, em São Paulo, o preço ...125...84030110.9120001.0000000.8842110.9385470.6720000.328000
59887PORAya-23PartialTestExistem indicações de que os candidatos da Esp...4828423Existem indicações de que os candidatos da Esp...95...6902330.9684211.0000000.9583330.9787230.7263160.273684
\n", "

59888 rows × 29 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df" } }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [
  "# Display name -> per-row metric column whose mean is reported below.\n",
  "METRIC_COLUMNS = {\n",
  "    'Accuracy': 'ROW_ACC',\n",
  "    'Precision': 'ROW_PREC',\n",
  "    'Recall': 'ROW_REC',\n",
  "    'F1-score': 'ROW_F1',\n",
  "}\n",
  "BANNER = \"######################################\"\n",
  "\n",
  "\n",
  "def mean_row_metrics(frame):\n",
  "    \"\"\"Return ``{display name: column mean}`` for the METRIC_COLUMNS of ``frame``.\"\"\"\n",
  "    return {name: frame[column].mean() for name, column in METRIC_COLUMNS.items()}\n",
  "\n",
  "\n",
  "df_unchanged = df[df['Type'] == 'Unchanged']\n",
  "df_rewritten = df[df['Type'] == 'Rewritten']\n",
  "df_partial = df[df['Type'] == 'Partial']\n",
  "\n",
  "print(BANNER)\n",
  "print(\" METRICS BY TEXT TYPE : \")\n",
  "print(BANNER)\n",
  "# Mean of the per-row metrics inside each text type, in the original report order.\n",
  "for label, subset in (('Partial', df_partial), ('Unchanged', df_unchanged), ('Rewritten', df_rewritten)):\n",
  "    for name, value in mean_row_metrics(subset).items():\n",
  "        print(f\"{label} Cases : Average {name} : \", value)\n",
  "\n",
  "print(BANNER)\n",
  "print(\" METRICS OVERALL : \")\n",
  "print(BANNER)\n",
  "# Mean of the per-row metrics over every row. The AVG_* names are kept as\n",
  "# notebook-level variables so that any cell reading them still works.\n",
  "overall_means = mean_row_metrics(df)\n",
  "AVG_ACC, AVG_PREC, AVG_REC, AVG_F1 = (overall_means[name] for name in METRIC_COLUMNS)\n",
  "for name, value in overall_means.items():\n",
  "    print(f\"All Cases : Average {name}:\", value)\n",
  "print(BANNER)\n",
  "\n",
  "# Pooled metrics: sum the confusion counts over all rows, then apply the same\n",
  "# formulas (and zero-denominator rule) as calculate_row_metrics to the totals.\n",
  "totals = {column: df[column].sum() for column in ('ROW_TP', 'ROW_FP', 'ROW_TN', 'ROW_FN')}\n",
  "total_tp = totals['ROW_TP']\n",
  "total_fp = totals['ROW_FP']\n",
  "total_tn = totals['ROW_TN']\n",
  "total_fn = totals['ROW_FN']\n",
  "accuracy, precision, recall, f1_score = calculate_row_metrics(totals)\n",
  "for name, value in zip(METRIC_COLUMNS, (accuracy, precision, recall, f1_score)):\n",
  "    print(f\"Overall {name}:\", value)"
], "metadata": { "id": "cuuc9gPjsU_T", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "53a90fd8-5259-44f4-a38b-c4b1dca331c5" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "######################################\n", " METRICS BY TEXT TYPE : \n", "######################################\n", "Partial Cases : Average Accuracy : 0.9660241365293537\n", "Partial Cases : Average Precision : 0.9615767453004874\n", "Partial Cases : Average Recall : 0.9736199776056205\n", "Partial
Cases : Average F1-score : 0.9585118099490816\n", "Unchanged Cases : Average Accuracy : 0.8473605153861489\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9323714208489493\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9323714208489493\n", "Rewritten Cases : Average F1-score : 0.9507238653530005\n", "######################################\n", " METRICS OVERALL : \n", "######################################\n", "All Cases : Average Accuracy: 0.950815773340835\n", "All Cases : Average Precision: 0.869102933343314\n", "All Cases : Average Recall: 0.8720620805505125\n", "All Cases : Average F1-score: 0.8617752847628559\n", "######################################\n", "Overall Accuracy: 0.9450566222508892\n", "Overall Precision: 0.9023369780446043\n", "Overall Recall: 0.9722911860372029\n", "Overall F1-score: 0.9360088655426044\n" ] } ] }, { "cell_type": "code", "source": [
  "# Per-generator breakdown: for every value of 'LLM used', report the mean of the\n",
  "# per-row metrics within each text type that has at least one row.\n",
  "llm_values = df['LLM used'].unique()\n",
  "metric_columns = (\n",
  "    ('Accuracy', 'ROW_ACC'),\n",
  "    ('Precision', 'ROW_PREC'),\n",
  "    ('Recall', 'ROW_REC'),\n",
  "    ('F1-score', 'ROW_F1'),\n",
  ")\n",
  "for llm in llm_values:\n",
  "    print(\"LLM used:\", llm)\n",
  "    rows_of_llm = df[df['LLM used'] == llm]\n",
  "    for text_type in ('Partial', 'Unchanged', 'Rewritten'):\n",
  "        rows_of_type = rows_of_llm[rows_of_llm['Type'] == text_type]\n",
  "        if not rows_of_type.empty:\n",
  "            for metric_name, column in metric_columns:\n",
  "                print(f\"{text_type} Cases : Average {metric_name} : {rows_of_type[column].mean()}\")\n",
  "    # One separator after each LLM's block of results.\n",
  "    print(\"######################################\")"
], "metadata": { "id": "9PwzmDF9xJzl", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ba8985e9-23c3-4045-dee8-3a4cb3fabcf6" },
"execution_count": 21, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "LLM used: PPLX-Sonar-Large\n", "Partial Cases : Average Accuracy : 0.9336572794715781\n", "Partial Cases : Average Precision : 0.9592279963612549\n", "Partial Cases : Average Recall : 0.9309725090136654\n", "Partial Cases : Average F1-score : 0.9296411128898119\n", "Unchanged Cases : Average Accuracy : 0.8650579359227606\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9875355565280338\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9875355565280338\n", "Rewritten Cases : Average F1-score : 0.9916950226205159\n", "######################################\n", "LLM used: Amazon-Nova-Pro-1.0\n", "Partial Cases : Average Accuracy : 0.9709451689144404\n", "Partial Cases : Average Precision : 0.9692946278654408\n", "Partial Cases : Average Recall : 0.9818457633020391\n", "Partial Cases : Average F1-score : 0.9721092096053271\n", "Unchanged Cases : Average Accuracy : 0.8539898250003753\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.912023254588777\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.912023254588777\n", "Rewritten Cases : Average F1-score : 0.9346390861707342\n", "######################################\n", "LLM used: Claude-Sonnet-3.5\n", "Partial Cases : Average Accuracy : 0.9913023042595986\n", "Partial Cases : Average Precision : 0.9879208373559784\n", "Partial Cases : Average Recall : 0.9972037804425087\n", "Partial Cases : Average F1-score : 0.9902393387651143\n", "Unchanged Cases : Average Accuracy : 0.846800778783857\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged 
Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9968475122215691\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9968475122215691\n", "Rewritten Cases : Average F1-score : 0.9975994361761628\n", "######################################\n", "LLM used: GPT-o1\n", "Partial Cases : Average Accuracy : 0.9875262733939048\n", "Partial Cases : Average Precision : 0.9839249255483435\n", "Partial Cases : Average Recall : 0.9932274727630463\n", "Partial Cases : Average F1-score : 0.9867913554825289\n", "Unchanged Cases : Average Accuracy : 0.8487406544030699\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.8816725505205283\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.8816725505205283\n", "Rewritten Cases : Average F1-score : 0.9182960571999935\n", "######################################\n", "LLM used: Mistral-Large-2411\n", "Partial Cases : Average Accuracy : 0.9624331896866248\n", "Partial Cases : Average Precision : 0.9356172199508818\n", "Partial Cases : Average Recall : 0.9884701479342386\n", "Partial Cases : Average F1-score : 0.9559254452870719\n", "Unchanged Cases : Average Accuracy : 0.8480262703769613\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9616318973693257\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9616318973693257\n", "Rewritten Cases : Average F1-score : 0.971613970701231\n", "######################################\n", "LLM used: Gemini-Flash-1.5\n", "Partial Cases : Average Accuracy : 0.9841269497179013\n", "Partial Cases : Average Precision : 0.9709423218373346\n", "Partial Cases : Average Recall : 0.9911900538961763\n", "Partial Cases : 
Average F1-score : 0.9742277702179398\n", "Unchanged Cases : Average Accuracy : 0.8258920595884512\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9914367111505826\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9914367111505826\n", "Rewritten Cases : Average F1-score : 0.99382686178664\n", "######################################\n", "LLM used: Command-R-Plus\n", "Partial Cases : Average Accuracy : 0.9695736975481161\n", "Partial Cases : Average Precision : 0.968999007842508\n", "Partial Cases : Average Recall : 0.9822901406275538\n", "Partial Cases : Average F1-score : 0.9715472453927925\n", "Unchanged Cases : Average Accuracy : 0.8408387261986766\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9273372760043561\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9273372760043561\n", "Rewritten Cases : Average F1-score : 0.9528583247915982\n", "######################################\n", "LLM used: Aya-23\n", "Partial Cases : Average Accuracy : 0.9151202982507924\n", "Partial Cases : Average Precision : 0.9067503175246787\n", "Partial Cases : Average Recall : 0.9039410653821318\n", "Partial Cases : Average F1-score : 0.862980083934799\n", "Unchanged Cases : Average Accuracy : 0.8620410631723532\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9187449129579435\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9187449129579435\n", "Rewritten Cases : Average F1-score : 0.9442432288035578\n", "######################################\n", "LLM used: 
Amazon-Nova-Lite-1.0\n", "Partial Cases : Average Accuracy : 0.9763954313612049\n", "Partial Cases : Average Precision : 0.9798847709468205\n", "Partial Cases : Average Recall : 0.9790540517342914\n", "Partial Cases : Average F1-score : 0.9758634105164036\n", "Unchanged Cases : Average Accuracy : 0.8213296969712184\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.8751641548335676\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.8751641548335676\n", "Rewritten Cases : Average F1-score : 0.9009799649627932\n", "######################################\n", "LLM used: Gemini-Pro-1.5\n", "Partial Cases : Average Accuracy : 0.9561886169191535\n", "Partial Cases : Average Precision : 0.9501778800749937\n", "Partial Cases : Average Recall : 0.9647885203762692\n", "Partial Cases : Average F1-score : 0.943518778111498\n", "Unchanged Cases : Average Accuracy : 0.8425220262180342\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.983617841815246\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.983617841815246\n", "Rewritten Cases : Average F1-score : 0.9884636709208505\n", "######################################\n", "LLM used: GPT-4o\n", "Partial Cases : Average Accuracy : 0.9517158986007322\n", "Partial Cases : Average Precision : 0.9417001920326047\n", "Partial Cases : Average Recall : 0.9741252207903043\n", "Partial Cases : Average F1-score : 0.9525049718414006\n", "Unchanged Cases : Average Accuracy : 0.8478907450886665\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.8923100804167152\n", "Rewritten Cases : 
Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.8923100804167152\n", "Rewritten Cases : Average F1-score : 0.9171247271361443\n", "######################################\n",
"LLM used: Claude-Haiku-3.5\n", "Partial Cases : Average Accuracy : 0.9930574873033187\n", "Partial Cases : Average Precision : 0.9844627962019611\n", "Partial Cases : Average Recall : 0.9959477113264821\n", "Partial Cases : Average F1-score : 0.9865154430866453\n", "Unchanged Cases : Average Accuracy : 0.8649229746061408\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.8510984640669441\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.8510984640669441\n", "Rewritten Cases : Average F1-score : 0.8906838392926133\n", "######################################\n" ] } ] }, { "cell_type": "code", "source": [
"def _ratio_or_zero(numerator, denominator):\n",
"    \"\"\"Return numerator / denominator, or 0 when the denominator equals 0.\"\"\"\n",
"    return 0 if denominator == 0 else numerator / denominator\n",
"\n",
"\n",
"# Row-level confusion-count columns that are summed per LLM.\n",
"COUNT_COLUMNS = ('ROW_TP', 'ROW_FP', 'ROW_TN', 'ROW_FN')\n",
"\n",
"# For every LLM, pool the confusion counts of all of its rows and derive each\n",
"# metric once from the pooled totals (not an average of the per-row metrics).\n",
"for llm_name in df['LLM used'].unique():\n",
"    print(\"LLM used:\", llm_name)\n",
"    llm_rows = df[df['LLM used'] == llm_name]\n",
"    tp, fp, tn, fn = (llm_rows[column].sum() for column in COUNT_COLUMNS)\n",
"\n",
"    overall_accuracy = _ratio_or_zero(tp + tn, tp + tn + fp + fn)\n",
"    overall_precision = _ratio_or_zero(tp, tp + fp)\n",
"    overall_recall = _ratio_or_zero(tp, tp + fn)\n",
"    # F1 falls back to 0 when precision and recall are both 0.\n",
"    overall_f1 = _ratio_or_zero(\n",
"        2 * (overall_precision * overall_recall),\n",
"        overall_precision + overall_recall,\n",
"    )\n",
"\n",
"    print(\"Overall Accuracy:\", overall_accuracy)\n",
"    print(\"Overall Precision:\", overall_precision)\n",
"    print(\"Overall Recall:\", overall_recall)\n",
"    print(\"Overall F1-score:\", overall_f1)\n",
"    print(\"######################################\")"
], "metadata": { "id": "02ubnS2dxq1x", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "5766ebc4-e9d2-414c-9165-3ca1523de46b" }, "execution_count": 22, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
"LLM used: PPLX-Sonar-Large\n", "Overall Accuracy: 0.9366108591040816\n", "Overall Precision: 0.9125891723509473\n", "Overall Recall: 0.9497106397668453\n", "Overall F1-score: 0.9307799325094863\n", "######################################\n",
"LLM used: Amazon-Nova-Pro-1.0\n", "Overall Accuracy: 0.948402978260228\n", "Overall Precision: 0.9228828332103408\n", "Overall Recall: 0.9758824982802415\n", "Overall F1-score: 0.9486429838975832\n", "######################################\n",
"LLM used: Claude-Sonnet-3.5\n", "Overall Accuracy: 0.9574911523031637\n", "Overall Precision: 0.8999167948486876\n", "Overall Recall: 0.997233910992673\n", "Overall F1-score: 0.9460793411211376\n", "######################################\n",
"LLM used: GPT-o1\n", "Overall Accuracy: 0.9501999198499007\n", "Overall Precision: 0.9129089002401813\n", "Overall Recall: 0.9745004540916091\n", "Overall F1-score: 0.9426997230744241\n", "######################################\n",
"LLM used: Mistral-Large-2411\n", "Overall Accuracy: 0.9383168759869477\n", "Overall Precision: 0.8595181566763317\n", "Overall Recall: 0.9856545940439103\n", "Overall F1-score: 0.9182750172974208\n", "######################################\n",
"LLM used: Gemini-Flash-1.5\n", "Overall Accuracy: 0.9489819515683161\n", "Overall Precision: 0.8932211950865856\n", "Overall Recall: 0.9911052274302997\n", "Overall F1-score: 0.9396208481961376\n", "######################################\n",
"LLM used: Command-R-Plus\n", "Overall Accuracy: 0.9571001458727652\n", "Overall Precision: 0.9267163647915617\n", "Overall Recall: 0.9699475751310622\n", "Overall F1-score: 0.9478392792141324\n", "######################################\n",
"LLM used:
Aya-23\n", "Overall Accuracy: 0.9272542758842602\n", "Overall Precision: 0.8858691458116764\n", "Overall Recall: 0.9439138646977308\n", "Overall F1-score: 0.9139708525403626\n", "######################################\n", "LLM used: Amazon-Nova-Lite-1.0\n", "Overall Accuracy: 0.9418637135748584\n", "Overall Precision: 0.9201736418230064\n", "Overall Recall: 0.9689753849736202\n", "Overall F1-score: 0.9439441739966153\n", "######################################\n", "LLM used: Gemini-Pro-1.5\n", "Overall Accuracy: 0.942350795400236\n", "Overall Precision: 0.8866794177417724\n", "Overall Recall: 0.9751917757652068\n", "Overall F1-score: 0.9288316817377275\n", "######################################\n", "LLM used: GPT-4o\n", "Overall Accuracy: 0.9316655070029065\n", "Overall Precision: 0.8668993884152754\n", "Overall Recall: 0.965425494749963\n", "Overall F1-score: 0.9135135135135135\n", "######################################\n", "LLM used: Claude-Haiku-3.5\n", "Overall Accuracy: 0.9604860726448516\n", "Overall Precision: 0.9124943529992157\n", "Overall Recall: 0.9807886513329381\n", "Overall F1-score: 0.9454097499203157\n", "######################################\n" ] } ] }, { "cell_type": "code", "source": [ "df.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mr8VT4S9whud", "outputId": "a4736d70-b83f-4b00-f0cc-196b033365a1" }, "execution_count": 23, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 59888 entries, 0 to 59887\n", "Data columns (total 29 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 ISO 59888 non-null object \n", " 1 LLM used 59888 non-null object \n", " 2 Type 59888 non-null object \n", " 3 Data Split 59888 non-null object \n", " 4 Original text 59888 non-null object \n", " 5 Original Word Count 59888 non-null int64 \n", " 6 Original Char Count 59888 non-null int64 \n", " 7 Split Location 59888 non-null int64 \n", " 8 Modified text
59888 non-null object \n", " 9 New Word Count 59888 non-null int64 \n", " 10 New Char Count 59888 non-null int64 \n", " 11 id 59888 non-null int64 \n", " 12 label_pred 59888 non-null int64 \n", " 13 text 59888 non-null object \n", " 14 label_gold 59888 non-null int64 \n", " 15 diff 59888 non-null int64 \n", " 16 Token Limit Check 59888 non-null object \n", " 17 WORDS_REAL 59888 non-null object \n", " 18 WORDS_PRED 59888 non-null object \n", " 19 ROW_TP 59888 non-null int64 \n", " 20 ROW_FP 59888 non-null int64 \n", " 21 ROW_TN 59888 non-null int64 \n", " 22 ROW_FN 59888 non-null int64 \n", " 23 ROW_ACC 59888 non-null float64\n", " 24 ROW_PREC 59888 non-null float64\n", " 25 ROW_REC 59888 non-null float64\n", " 26 ROW_F1 59888 non-null float64\n", " 27 Label : 1 59888 non-null float64\n", " 28 Label : 0 59888 non-null float64\n", "dtypes: float64(6), int64(13), object(10)\n", "memory usage: 13.3+ MB\n" ] } ] }, { "cell_type": "code", "source": [
"# Name of the CSV file that receives the evaluated DataFrame.\n",
"OUTPUT_CSV = \"POR-INFERENCE-3.csv\"\n",
"\n",
"# to_csv defaults are kept, so the DataFrame index is written as the first column.\n",
"df.to_csv(OUTPUT_CSV)"
], "metadata": { "id": "AEEjGzfkw0le" }, "execution_count": 24, "outputs": [] }, { "cell_type": "code", "source": [
"# Manual reminder printed at the end of the run.\n",
"print(\"CLICK CTRL+S, WAIT 2 SEC FOR IT TO BE SAVED, DOWNLOAD BOTH CODE AND THE CSV FILE FROM RUNTIME\")"
], "metadata": { "id": "pno4DPr0Kl8X", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a9f51818-e6c7-4124-e741-bab3e92814f0" }, "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "CLICK CTRL+S, WAIT 2 SEC FOR IT TO BE SAVED, DOWNLOAD BOTH CODE AND THE CSV FILE FROM RUNTIME\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "G-TXErXEOXWy" }, "execution_count": 25, "outputs": [] } ] }