{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "machine_shape": "hm", "gpuType": "A100" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU", "widgets": { "application/vnd.jupyter.widget-state+json": { "21330d77a6544f1aacf16e3d6608dab5": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [], "layout": "IPY_MODEL_1bbdbe20067e4286825c547ddd8fd7ac" } }, "56221ae606894685b38769e2963c7e74": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_394ccc1b0b9f4c0fbf76e8b9488da9b2", "placeholder": "​", "style": "IPY_MODEL_69df7b96610f4cbfaf170db215dceae6", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "ec4aaf7003294820a32d0cabcac0d5bc": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_dd30d3e4bb594d98a5a64d9375cf5b36", "placeholder": "​", "style": "IPY_MODEL_393c9d36bda74ae5b2dbf62444070193", "value": "" } }, "e9c0c88f16a94f19a9c709d9890b8e24": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_32ec1e08113c49e382e3a71bf94416ed", "style": "IPY_MODEL_914fc800d68b4446a55d6468a74a111e", "value": true } }, "876a0dfa27bb4bc885059ba98aed0a97": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_d629f6e63271444c9eac198d8b943d56", "style": "IPY_MODEL_96fca2e9f7474c81a2fcf0b5d409d075", "tooltip": "" } }, "e1e09a1b9df04a18ad1fbc4dd35a3c0e": { 
"model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_64d20184b5d84978ac884a187d48b696", "placeholder": "​", "style": "IPY_MODEL_ee421c04a5c64bc286a7dfa9cc1100e0", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " } }, "1bbdbe20067e4286825c547ddd8fd7ac": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "394ccc1b0b9f4c0fbf76e8b9488da9b2": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", 
"model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "69df7b96610f4cbfaf170db215dceae6": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "dd30d3e4bb594d98a5a64d9375cf5b36": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, 
"grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "393c9d36bda74ae5b2dbf62444070193": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "32ec1e08113c49e382e3a71bf94416ed": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, 
"object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "914fc800d68b4446a55d6468a74a111e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "d629f6e63271444c9eac198d8b943d56": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "96fca2e9f7474c81a2fcf0b5d409d075": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "64d20184b5d84978ac884a187d48b696": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ee421c04a5c64bc286a7dfa9cc1100e0": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bc1961f16c3545b5812bdcbf538aac5a": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], 
"_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c7ef5d827e264f51b0d9220c0eebaba3", "placeholder": "​", "style": "IPY_MODEL_74383dba69c34509a937ed56073826be", "value": "Connecting..." } }, "c7ef5d827e264f51b0d9220c0eebaba3": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "74383dba69c34509a937ed56073826be": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", 
"description_width": "" } }, "46b3179893e04e5ab4eb6e0967848b29": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_b521ac3bdb5e4c7ea3841ab17058e0d4", "IPY_MODEL_bbdbb4bcd9bb4dd9a07d13eab2e98c8b", "IPY_MODEL_f33364ef9aed41708d0e299f65908c28" ], "layout": "IPY_MODEL_5d57450f4992496cbc1ba1397388e7d0" } }, "b521ac3bdb5e4c7ea3841ab17058e0d4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ada58e7c96ba4749a87bd7520255fefa", "placeholder": "​", "style": "IPY_MODEL_88055b608df2435182e374b1e487c4ff", "value": "config.json: 100%" } }, "bbdbb4bcd9bb4dd9a07d13eab2e98c8b": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_69ed06e493184af08fe9a85b36939914", "max": 772, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_4d3cf3fb6a8848efb8bb5a4a113ff95e", "value": 772 } }, 
"f33364ef9aed41708d0e299f65908c28": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_4ad2c43f06fb47e48dd2bfa66cc63480", "placeholder": "​", "style": "IPY_MODEL_2713a4657b0d4e38a33ba96a782e3674", "value": " 772/772 [00:00<00:00, 74.4kB/s]" } }, "5d57450f4992496cbc1ba1397388e7d0": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "ada58e7c96ba4749a87bd7520255fefa": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": 
"1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "88055b608df2435182e374b1e487c4ff": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "69ed06e493184af08fe9a85b36939914": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, 
"grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4d3cf3fb6a8848efb8bb5a4a113ff95e": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "4ad2c43f06fb47e48dd2bfa66cc63480": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": 
null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2713a4657b0d4e38a33ba96a782e3674": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "aec339a98bb44b8faa5f44dd0564fea0": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_f86c420a0a134d468c07fd2329c579d3", "IPY_MODEL_d23251854d8b4dddb5aeab7f017b503d", "IPY_MODEL_60a889e56e774a2f9b04a1abeeac5729" ], "layout": "IPY_MODEL_92c1f3a6e5da40bfba6524de99428481" } }, "f86c420a0a134d468c07fd2329c579d3": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_230b7df7576441a5bbe91022e219327c", "placeholder": "​", "style": "IPY_MODEL_639900294423423aa45c7697704855c7", "value": "tf_model.h5: 100%" } }, "d23251854d8b4dddb5aeab7f017b503d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_297b487addb841c785eb372691163121", "max": 1246320936, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_62a7c80684444f88bc1ec110fc99c197", "value": 1246320936 } }, "60a889e56e774a2f9b04a1abeeac5729": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_2f5a2dafd58d45db84cfb73d4c11487e", "placeholder": "​", "style": "IPY_MODEL_e01e1ee0c58945fb82f124ff1537d148", "value": " 1.25G/1.25G [00:05<00:00, 231MB/s]" } }, "92c1f3a6e5da40bfba6524de99428481": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, 
"max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "230b7df7576441a5bbe91022e219327c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "639900294423423aa45c7697704855c7": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "297b487addb841c785eb372691163121": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", 
"state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "62a7c80684444f88bc1ec110fc99c197": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "2f5a2dafd58d45db84cfb73d4c11487e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, 
"grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e01e1ee0c58945fb82f124ff1537d148": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f381116834034fa3a2ecca34096571e6": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_aea561053459432eb93b6761f8260893", "IPY_MODEL_b76009f6e47d473f809bd3f79d04843d", "IPY_MODEL_8b9ef738e7e54c409365ac4407164de0" ], "layout": "IPY_MODEL_e9a64c46f4d5418080b28ffcf7b7e7a6" } }, "aea561053459432eb93b6761f8260893": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": 
"@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_8f6b13c8b0f044c7a255c7834e4c5cd9", "placeholder": "​", "style": "IPY_MODEL_9837ccc7a7174fffa25ce2239874fc98", "value": "tokenizer_config.json: 100%" } }, "b76009f6e47d473f809bd3f79d04843d": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_97d7ba4acf2f487db73d73aeddbca24b", "max": 453, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_a2509d0e6f874f029efbc29058ad20bd", "value": 453 } }, "8b9ef738e7e54c409365ac4407164de0": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_c476b6a4c90b4c35afc776bc2cecef0b", "placeholder": "​", "style": "IPY_MODEL_2bbbafc9bbdf49f8a24db7ac3ce4207d", "value": " 453/453 [00:00<00:00, 43.3kB/s]" } }, "e9a64c46f4d5418080b28ffcf7b7e7a6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": 
"LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "8f6b13c8b0f044c7a255c7834e4c5cd9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9837ccc7a7174fffa25ce2239874fc98": { "model_module": "@jupyter-widgets/controls", 
"model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "97d7ba4acf2f487db73d73aeddbca24b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "a2509d0e6f874f029efbc29058ad20bd": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "c476b6a4c90b4c35afc776bc2cecef0b": { "model_module": 
"@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2bbbafc9bbdf49f8a24db7ac3ce4207d": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "cc903da67549457687d7ee2a9db16f4d": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ 
"IPY_MODEL_909c9d1828574fedb91055afba368bf4", "IPY_MODEL_4aeeaf4b71a04fd08a3f63f8353ba1c0", "IPY_MODEL_fb955a70314d4ef28c53191d33c49dad" ], "layout": "IPY_MODEL_5ebb1ce43ecf4e609faaad857870f593" } }, "909c9d1828574fedb91055afba368bf4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_af724ae3920b487690b076f3f9c5b884", "placeholder": "​", "style": "IPY_MODEL_c800ff7792ba4a6b989a9e048a96a1bb", "value": "tokenizer.json: 100%" } }, "4aeeaf4b71a04fd08a3f63f8353ba1c0": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6a8dae001f194ce5a2318ad71286fa98", "max": 17082660, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_39d616fc55da40be9df3b21f6047ebcb", "value": 17082660 } }, "fb955a70314d4ef28c53191d33c49dad": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": 
"IPY_MODEL_b2b1a430abbd41bd834551289f579f55", "placeholder": "​", "style": "IPY_MODEL_f58628d63fa54e4cb3ab9e916fe1f2a1", "value": " 17.1M/17.1M [00:00<00:00, 39.0MB/s]" } }, "5ebb1ce43ecf4e609faaad857870f593": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "af724ae3920b487690b076f3f9c5b884": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, 
"grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c800ff7792ba4a6b989a9e048a96a1bb": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6a8dae001f194ce5a2318ad71286fa98": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, 
"visibility": null, "width": null } }, "39d616fc55da40be9df3b21f6047ebcb": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "b2b1a430abbd41bd834551289f579f55": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f58628d63fa54e4cb3ab9e916fe1f2a1": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "91ab803ffddb486bac85192d8f5d3ce3": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_9d9110ab8c1b48afb6e6d026d2dd54be", "IPY_MODEL_2ee05e21a23740d391f07257116fe410", "IPY_MODEL_360d750921e24b6fb7d5232fa920921d" ], "layout": "IPY_MODEL_3e9a8d5dbbfb4fa7ba8815ba4584ad0f" } }, "9d9110ab8c1b48afb6e6d026d2dd54be": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_dfdabd84fe9e49c699f08b67b0a5f619", "placeholder": "​", "style": "IPY_MODEL_bee544c10de247a8b03372378335110e", "value": "special_tokens_map.json: 100%" } }, "2ee05e21a23740d391f07257116fe410": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6009c4551e494127811071e230b2e7be", "max": 280, "min": 0, "orientation": "horizontal", "style": 
"IPY_MODEL_efa7db4024d640069bfbe9976225a6db", "value": 280 } }, "360d750921e24b6fb7d5232fa920921d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_43d8e78d096146ee811b85064c08afe6", "placeholder": "​", "style": "IPY_MODEL_1d4e1988573b432493ec6c5165428aad", "value": " 280/280 [00:00<00:00, 26.1kB/s]" } }, "3e9a8d5dbbfb4fa7ba8815ba4584ad0f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "dfdabd84fe9e49c699f08b67b0a5f619": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { 
"_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "bee544c10de247a8b03372378335110e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "6009c4551e494127811071e230b2e7be": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, 
"grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "efa7db4024d640069bfbe9976225a6db": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "43d8e78d096146ee811b85064c08afe6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, 
"order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "1d4e1988573b432493ec6c5165428aad": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } } }, "cells": [ { "cell_type": "code", "source": [ "from huggingface_hub import notebook_login\n", "notebook_login()" ], "metadata": { "id": "yrM6ZzXldMLo", "colab": { "base_uri": "https://localhost:8080/", "height": 17, "referenced_widgets": [ "21330d77a6544f1aacf16e3d6608dab5", "56221ae606894685b38769e2963c7e74", "ec4aaf7003294820a32d0cabcac0d5bc", "e9c0c88f16a94f19a9c709d9890b8e24", "876a0dfa27bb4bc885059ba98aed0a97", "e1e09a1b9df04a18ad1fbc4dd35a3c0e", "1bbdbe20067e4286825c547ddd8fd7ac", "394ccc1b0b9f4c0fbf76e8b9488da9b2", "69df7b96610f4cbfaf170db215dceae6", "dd30d3e4bb594d98a5a64d9375cf5b36", "393c9d36bda74ae5b2dbf62444070193", "32ec1e08113c49e382e3a71bf94416ed", "914fc800d68b4446a55d6468a74a111e", "d629f6e63271444c9eac198d8b943d56", "96fca2e9f7474c81a2fcf0b5d409d075", "64d20184b5d84978ac884a187d48b696", "ee421c04a5c64bc286a7dfa9cc1100e0", "bc1961f16c3545b5812bdcbf538aac5a", "c7ef5d827e264f51b0d9220c0eebaba3", "74383dba69c34509a937ed56073826be" ] }, "outputId": "9b0d3e59-32fa-4ce3-c8dd-18965b7c5e78" }, "execution_count": 1, "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Count
0VIEClaude-Haiku-3.5PartialTestviệc riêng tư nhất từ cuộc sống cách đây hàng ...7231322việc riêng tư nhất từ cuộc sống cách đây hàng ...89409
1VIEGPT-4oPartialTestChốt phiên, chỉ số Dow Jones Industrial lấy lạ...9847865Chốt phiên, chỉ số Dow Jones Industrial lấy lạ...135645
2VIEAmazon-Nova-Pro-1.0PartialDevRõ ràng, Buffett, nhà đầu tư được tôn sùng nhấ...7635032Rõ ràng, Buffett, nhà đầu tư được tôn sùng nhấ...124555
3VIEClaude-Sonnet-3.5PartialTestCác triệu chứng đầu tiên xuất hiện trong vòng ...7434437Các triệu chứng đầu tiên xuất hiện trong vòng ...96437
4VIEMistral-Large-2411RewrittenTrainCổ động viên Anh Thành công ngoài kỳ vọng Dưới...60127270**Cập nhật ngày hôm sau:**\\n\\nSau trận bán kết...66305
....................................
99924VIEAmazon-Nova-Lite-1.0PartialTestBạn chớ nên bị nhập nhằng giữa sự nghe và thấu...9944352Bạn chớ nên bị nhập nhằng giữa sự nghe và thấu...155686
99925VIEGPT-o1PartialTrainÐứng về phương diện chung, không có nền văn hó...7432964Ðứng về phương diện chung , không có nền văn h...195854
99926VIEMistral-Large-2411PartialTrainBan Chăm sóc Sức khỏe Trung ương dự kiến sẽ hộ...8263803281Ban Chăm sóc Sức khỏe Trung ương dự kiến sẽ hộ...3291521
99927VIEAmazon-Nova-Pro-1.0PartialTrainXnote Z330 cấu hình thấp nhất (giá 1.500 USD) ...7131832Xnote Z330 cấu hình thấp nhất (giá 1.500 USD) ...117524
99928VIEAmazon-Nova-Pro-1.0RewrittenTest- Hằng tháng, các ngân hàng đều có báo cáo gửi...753250- Hôm qua, các ngân hàng tiếp tục gửi báo cáo ...92396
\n", "

99929 rows × 11 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", " \n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 99929,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"VIE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"GPT-o1\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Partial\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99929,\n \"samples\": [\n \"Em V\\u0169 Th\\u1ecb Nhung, l\\u1edbp 8D Tr\\u01b0\\u1eddng THCS Th\\u1ecbnh Long, t\\u1eeb 1 gi\\u1edd chi\\u1ec1u \\u0111\\u00e3 c\\u00f9ng c\\u00f4 gi\\u00e1o \\u0111i xe bu\\u00fdt qua 25 km v\\u1ec1 trung t\\u00e2m H.H\\u1ea3i H\\u1eadu \\u0111\\u1ec3 nh\\u1eadn qu\\u00e0. Nhung b\\u1ebdn l\\u1ebdn: \\u201cC\\u00f3 ti\\u1ec1n em s\\u1ebd v\\u1ec1 \\u0111\\u01b0a m\\u1eb9 ra ch\\u1ee3 mua cho b\\u1ed1 c\\u00e1i \\u00e1o kho\\u00e1c. B\\u1ed1 em b\\u1ecb m\\u00f9, t\\u1eeb 3 n\\u0103m nay ch\\u1ec9 c\\u00f3 m\\u1ed9t chi\\u1ebfc \\u00e1o kho\\u00e1c m\\u1ecfng. Em c\\u00f3 \\u00e1o \\u0111\\u1ed3ng ph\\u1ee5c r\\u1ed3i n\\u00ean kh\\u00f4ng c\\u1ea7n mua n\\u1eefa\\u201d. 
M\\u1eb9 l\\u00e0m ru\\u1ed9ng, b\\u1ed1 b\\u1ecb m\\u00f9, gia \\u0111\\u00ecnh r\\u1ea5t kh\\u00f3 kh\\u0103n nh\\u01b0ng h\\u1eb1ng ng\\u00e0y Nhung v\\u1eeba gi\\u00fap m\\u1eb9 ra \\u0111\\u1ed3ng v\\u1eeba t\\u00edch c\\u1ef1c h\\u1ecdc t\\u1eadp \\u0111\\u1ec3 \\u0111\\u1ea1t danh hi\\u1ec7u h\\u1ecdc sinh ti\\u00ean ti\\u1ebfn.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 720,\n \"min\": 7,\n \"max\": 25320,\n \"num_unique_values\": 3451,\n \"samples\": [\n 1919\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3281,\n \"min\": 33,\n \"max\": 112944,\n \"num_unique_values\": 9863,\n \"samples\": [\n 11035\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 421,\n \"min\": 0,\n \"max\": 18351,\n \"num_unique_values\": 2398,\n \"samples\": [\n 224\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 99909,\n \"samples\": [\n \"T\\u01b0\\u01a1ng t\\u1ef1, ch\\u1ecb Mai H\\u01b0\\u01a1ng, ph\\u1ee5 huynh c\\u00f3 con h\\u1ecdc t\\u1ea1i Tr\\u01b0\\u1eddng m\\u1ea7m non C\\u00f4 Giang (Q.1) c\\u0169ng do d\\u1ef1 tr\\u01b0\\u1edbc nhi\\u1ec1u ch\\u01b0\\u01a1ng tr\\u00ecnh T\\u01b0\\u01a1ng t\\u1ef1, ch\\u1ecb Mai H\\u01b0\\u01a1ng, ph\\u1ee5 huynh c\\u00f3 con h\\u1ecdc t\\u1ea1i Tr\\u01b0\\u1eddng m\\u1ea7m non C\\u00f4 Giang (Q.1) c\\u0169ng do d\\u1ef1 tr\\u01b0\\u1edbc nhi\\u1ec1u ch\\u01b0\\u01a1ng tr\\u00ecnh **h\\u1ecdc th\\u00eam \\u0111\\u01b0\\u1ee3c qu\\u1ea3ng c\\u00e1o r\\u1ea7m r\\u1ed9. 
Ch\\u1ecb chia s\\u1ebb: \\\"T\\u00f4i th\\u1ea5y nhi\\u1ec1u trung t\\u00e2m qu\\u1ea3ng c\\u00e1o ch\\u01b0\\u01a1ng tr\\u00ecnh n\\u00e0y n\\u1ecd r\\u1ea5t h\\u1ea5p d\\u1eabn, nh\\u01b0ng kh\\u00f4ng bi\\u1ebft ch\\u01b0\\u01a1ng tr\\u00ecnh n\\u00e0o th\\u1ef1c s\\u1ef1 ph\\u00f9 h\\u1ee3p v\\u1edbi con m\\u00ecnh. H\\u01a1n n\\u1eefa, l\\u1ecbch h\\u1ecdc c\\u1ee7a b\\u00e9 \\u0111\\u00e3 k\\u00edn m\\u00edt r\\u1ed3i, th\\u00eam v\\u00e0o n\\u1eefa s\\u1ee3 b\\u00e9 l\\u1ea1i b\\u1ecb qu\\u00e1 t\\u1ea3i.\\\" Ch\\u1ecb H\\u01b0\\u01a1ng c\\u0169ng b\\u00e0y t\\u1ecf lo l\\u1eafng v\\u1ec1 ch\\u1ea5t l\\u01b0\\u1ee3ng gi\\u1ea3ng d\\u1ea1y v\\u00e0 chi ph\\u00ed kh\\u00e1 cao c\\u1ee7a c\\u00e1c l\\u1edbp h\\u1ecdc th\\u00eam n\\u00e0y.**\\n\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 429,\n \"min\": 1,\n \"max\": 18582,\n \"num_unique_values\": 2495,\n \"samples\": [\n 2956\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1948,\n \"min\": 6,\n \"max\": 81494,\n \"num_unique_values\": 7179,\n \"samples\": [\n 2306\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 2 } ], "source": [ "import pandas as pd\n", "splits = {'Arabic': 'Data-v3.1/ARA-v3-1.csv', 'Chinese': 'Data-v3.1/ZHO-v3-1.csv', 'Czech': 'Data-v3.1/CES-v3-1.csv', 'Dutch': 'Data-v3.1/NLD-v3-1.csv', 'English': 'Data-v3.1/ENG-v3-1.csv', 'French': 'Data-v3.1/FRA-v3-1.csv', 'German': 'Data-v3.1/DEU-v3-1.csv', 'Greek': 'Data-v3.1/ELL-v3-1.csv', 'Hebrew': 'Data-v3.1/HEB-v3-1.csv', 'Hindi': 'Data-v3.1/HIN-v3-1.csv', 'Indonesian': 'Data-v3.1/IND-v3-1.csv', 'Italian': 'Data-v3.1/ITA-v3-1.csv', 'Japanese': 'Data-v3.1/JPN-v3-1.csv', 'Korean': 'Data-v3.1/KOR-v3-1.csv', 'Persian': 'Data-v3.1/PES-v3-1.csv', 'Polish': 
'Data-v3.1/POL-v3-1.csv', 'Portuguese': 'Data-v3.1/POR-v3-1.csv', 'Romanian': 'Data-v3.1/RON-v3-1.csv', 'Russian': 'Data-v3.1/RUS-v3-1.csv', 'Spanish': 'Data-v3.1/SPA-v3-1.csv', 'Turkish': 'Data-v3.1/TUR-v3-1.csv', 'Vietnamese': 'Data-v3.1/VIE-v3-1.csv', 'Ukrainian': 'Data-v3.1/UKR-v3-1.csv'}\n", "df = pd.read_csv(\"hf://datasets/1024m/mMGTD-Corpus/\" + splits[\"Vietnamese\"])\n", "df" ] }, { "cell_type": "code", "source": [
 "# Shuffle with a fixed seed so the row order, and the ids derived from the index below, are reproducible.\n",
 "df = df.sample(frac=1, random_state=1024).reset_index(drop=True)"
 ], "metadata": { "id": "KIgwx1iCpC3f" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [
 "# Keep the three known sample types and separate the rows by their predefined data split.\n",
 "type_mask = df['Type'].isin(['Partial', 'Rewritten', 'Unchanged'])\n",
 "df_train = df[type_mask & (df['Data Split'] == 'Train')]\n",
 "df_dev = df[type_mask & (df['Data Split'] == 'Dev')]\n",
 "df_test = df[type_mask & (df['Data Split'] == 'Test')]\n",
 "print(len(df_train))\n",
 "print(len(df_dev))\n",
 "print(len(df_test))"
 ], "metadata": { "id": "cVKBbVG9qDGF", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "9df35587-0b7b-4da9-84e2-48638d36845f" }, "execution_count": 4, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "39973\n", "9993\n", "49963\n" ] } ] }, { "cell_type": "code", "source": [
 "VIE_train = df_train.copy()\n",
 "VIE_dev = df_dev.copy()\n",
 "VIE_test = df_test.copy()"
 ], "metadata": { "id": "1QWJPFozqFUh" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [
 "def to_task_frame(frame):\n",
 "    \"\"\"Return a copy of `frame` with an 'id' column ('VIE' + row index) and the text/label column names.\"\"\"\n",
 "    # assign() works on a copy, so no value is ever written into a slice of `df`.\n",
 "    return frame.assign(id='VIE' + frame.index.astype(str)).rename(\n",
 "        columns={'Modified text': 'text', 'Split Location': 'label'})\n",
 "\n",
 "# Built from the untouched VIE_* copies so this cell can be re-run safely.\n",
 "df_train = to_task_frame(VIE_train)\n",
 "df_dev = to_task_frame(VIE_dev)\n",
 "df_test = to_task_frame(VIE_test)"
 ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bSlHXAnzqHmd", "outputId": "22bf5e6b-4a1a-4088-aa8b-5e62fc38bd74" }, "execution_count": 6, "outputs": [] }, { "cell_type": "code", "source": [
 "# Train on Train + Dev. The train part is rebuilt from VIE_train so re-running does not append Dev twice.\n",
 "df_train = pd.concat([to_task_frame(VIE_train), df_dev], ignore_index=True)"
 ], "metadata": { "id": "aGvboB0ZqJ8M" }, "execution_count": 7, "outputs": [] }, { "cell_type": "code", "source": [ "print(len(df_train))\n", "print(len(df_dev))\n", "print(len(df_test))" ], "metadata": { "id": "qIVYeup9qM5X", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "97bbd6f7-c4a0-4039-dd21-3cc899923849" }, "execution_count": 8, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "49966\n", 
"9993\n", "49963\n" ] } ] }, { "cell_type": "code", "source": [ "df_train.to_json('VIE_train.jsonl', orient='records', lines=True)\n", "df_test.to_json('VIE_test.jsonl', orient='records', lines=True)" ], "metadata": { "id": "9javNVKDqO1j" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "!pip install torch\n", "!pip install transformers\n", "!pip install accelerate -U\n", "!pip install tqdm\n", "!pip install pytorch-crf\n", "!pip install sentencepiece" ], "metadata": { "id": "C6wCkGRXqQpc", "collapsed": true, "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ed19d3a6-0fdf-44ea-e5ca-03e6bc71c43d" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: torch in /usr/local/lib/python3.11/dist-packages (2.5.1+cu124)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from torch) (3.17.0)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.11/dist-packages (from torch) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.5)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch) (2024.10.0)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)\n", " Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)\n", " Downloading 
nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)\n", " Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)\n", " Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-curand-cu12==10.3.5.147 (from torch)\n", " Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)\n", " Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)\n", " Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch) (2.21.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch) (12.4.127)\n", "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)\n", " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch) (3.1.0)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch) (3.0.2)\n", "Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m2.1 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m105.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m86.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m57.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading 
nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m49.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: nvidia-nvjitlink-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12\n", " Attempting uninstall: nvidia-nvjitlink-cu12\n", " Found existing installation: nvidia-nvjitlink-cu12 12.5.82\n", " Uninstalling nvidia-nvjitlink-cu12-12.5.82:\n", " Successfully uninstalled nvidia-nvjitlink-cu12-12.5.82\n", " Attempting uninstall: nvidia-curand-cu12\n", " Found existing installation: nvidia-curand-cu12 10.3.6.82\n", " Uninstalling nvidia-curand-cu12-10.3.6.82:\n", " Successfully uninstalled nvidia-curand-cu12-10.3.6.82\n", " Attempting uninstall: nvidia-cufft-cu12\n", " Found existing installation: nvidia-cufft-cu12 11.2.3.61\n", " Uninstalling nvidia-cufft-cu12-11.2.3.61:\n", " Successfully uninstalled nvidia-cufft-cu12-11.2.3.61\n", " Attempting uninstall: nvidia-cuda-runtime-cu12\n", " Found existing installation: nvidia-cuda-runtime-cu12 12.5.82\n", " Uninstalling nvidia-cuda-runtime-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-runtime-cu12-12.5.82\n", " Attempting uninstall: nvidia-cuda-nvrtc-cu12\n", " Found existing installation: nvidia-cuda-nvrtc-cu12 12.5.82\n", " Uninstalling nvidia-cuda-nvrtc-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.5.82\n", " Attempting uninstall: nvidia-cuda-cupti-cu12\n", " Found existing installation: 
nvidia-cuda-cupti-cu12 12.5.82\n", " Uninstalling nvidia-cuda-cupti-cu12-12.5.82:\n", " Successfully uninstalled nvidia-cuda-cupti-cu12-12.5.82\n", " Attempting uninstall: nvidia-cublas-cu12\n", " Found existing installation: nvidia-cublas-cu12 12.5.3.2\n", " Uninstalling nvidia-cublas-cu12-12.5.3.2:\n", " Successfully uninstalled nvidia-cublas-cu12-12.5.3.2\n", " Attempting uninstall: nvidia-cusparse-cu12\n", " Found existing installation: nvidia-cusparse-cu12 12.5.1.3\n", " Uninstalling nvidia-cusparse-cu12-12.5.1.3:\n", " Successfully uninstalled nvidia-cusparse-cu12-12.5.1.3\n", " Attempting uninstall: nvidia-cudnn-cu12\n", " Found existing installation: nvidia-cudnn-cu12 9.3.0.75\n", " Uninstalling nvidia-cudnn-cu12-9.3.0.75:\n", " Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75\n", " Attempting uninstall: nvidia-cusolver-cu12\n", " Found existing installation: nvidia-cusolver-cu12 11.6.3.83\n", " Uninstalling nvidia-cusolver-cu12-11.6.3.83:\n", " Successfully uninstalled nvidia-cusolver-cu12-11.6.3.83\n", "Successfully installed nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nvjitlink-cu12-12.4.127\n", "Requirement already satisfied: transformers in /usr/local/lib/python3.11/dist-packages (4.48.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers) (3.17.0)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.28.1)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers) (24.2)\n", "Requirement already satisfied: 
pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (6.0.2)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers) (2024.11.6)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers) (2.32.3)\n", "Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.21.0)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers) (0.5.2)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers) (4.67.1)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (2024.10.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.24.0->transformers) (4.12.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers) (2025.1.31)\n", "Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (1.3.0)\n", "Requirement already satisfied: numpy<3.0.0,>=1.17 in /usr/local/lib/python3.11/dist-packages (from accelerate) (1.26.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (24.2)\n", "Requirement already satisfied: psutil in 
/usr/local/lib/python3.11/dist-packages (from accelerate) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from accelerate) (6.0.2)\n", "Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (2.5.1+cu124)\n", "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.28.1)\n", "Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from accelerate) (0.5.2)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (3.17.0)\n", "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2024.10.0)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3)\n", "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.67.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.21.0->accelerate) (4.12.2)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.4.2)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.1.5)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) 
(12.4.127)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (9.1.0.70)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.5.8)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (11.2.1.3)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (10.3.5.147)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (11.6.1.9)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.3.1.170)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (2.21.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (12.4.127)\n", "Requirement already satisfied: triton==3.1.0 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (3.1.0)\n", "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.11/dist-packages (from torch>=2.0.0->accelerate) (1.13.1)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch>=2.0.0->accelerate) (1.3.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch>=2.0.0->accelerate) (3.0.2)\n", "Requirement already satisfied: charset-normalizer<4,>=2 
in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.4.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.3.0)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2025.1.31)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (4.67.1)\n", "Collecting pytorch-crf\n", " Downloading pytorch_crf-0.7.2-py3-none-any.whl.metadata (2.4 kB)\n", "Downloading pytorch_crf-0.7.2-py3-none-any.whl (9.5 kB)\n", "Installing collected packages: pytorch-crf\n", "Successfully installed pytorch-crf-0.7.2\n", "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.11/dist-packages (0.2.0)\n" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "os.makedirs(\"./runs/exp_seed\", exist_ok=True)\n", "os.makedirs(\"./runs/exp_seed/logs\", exist_ok=True)\n", "os.makedirs(\"./runs/exp_seed/xlmlongformerbase\", exist_ok=True)" ], "metadata": { "id": "if7zZ-egqSrE" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "import torch\n", "import json\n", "from transformers import AutoTokenizer, AutoModelForTokenClassification\n", "from transformers.trainer_callback import TrainerState\n", "from torch.utils.data import Dataset, DataLoader\n", "from torch.nn.utils.rnn import pad_sequence\n", "import transformers\n", "from torch import nn\n", "from transformers import AutoModel, AutoConfig\n", "from torchcrf import CRF\n", "from torch.cuda.amp import autocast\n", "from transformers import Trainer\n", "from tqdm import tqdm\n", "import numpy as np\n", "import logging\n", "import glob\n", "from tqdm import tqdm\n", "from 
dataclasses import dataclass, field\n", "logging.basicConfig(level=logging.INFO)\n", "logger = logging.getLogger()\n",
 "\n",
 "\n",
 "# @dataclass only turns *annotated* attributes into fields, so every setting below carries a type.\n",
 "@dataclass\n",
 "class ModelConfig:\n",
 "    \"\"\"Where the pretrained encoder comes from and where checkpoints are written.\"\"\"\n",
 "\n",
 "    model_path: str = \"hyperonym/xlm-roberta-longformer-base-16384\"\n",
 "    model_checkpoint_dir: str = \"./runs\"\n",
 "\n",
 "\n",
 "@dataclass\n",
 "class DatasetConfig:\n",
 "    \"\"\"JSONL files read by Semeval_Data.\"\"\"\n",
 "\n",
 "    # Relative paths: the export cell writes VIE_*.jsonl into the working directory\n",
 "    # (which is /content on Colab, so the resolved files are the same there).\n",
 "    train_file: str = \"VIE_train.jsonl\"\n",
 "    # A list default must come from a factory so instances do not share one list.\n",
 "    test_files: list = field(default_factory=lambda: [\"VIE_test.jsonl\"])\n",
 "\n",
 "\n",
 "@dataclass\n",
 "class TrainingArgsConfig:\n",
 "    \"\"\"Run switches and hyper-parameters used by the training and inference cells.\"\"\"\n",
 "\n",
 "    do_train: bool = False  # the __main__ block trains only when this is True\n",
 "    do_predict: bool = False\n",
 "    seed: int = 1024\n",
 "    output_dir: str = \"./runs/exp_seed\"\n",
 "    logging_steps: int = 160\n",
 "    logging_dir: str = \"./runs/exp_seed\"\n",
 "    num_train_epochs: int = 30\n",
 "    per_device_train_batch_size: int = 12\n",
 "    per_device_eval_batch_size: int = 12\n",
 "    max_length: int = 2048  # fixed sequence length produced by Semeval_Data\n",
 "\n",
 "\n",
 "model_args = ModelConfig()\n",
 "data_args = DatasetConfig()\n",
 "training_args = TrainingArgsConfig()\n",
 "\n",
 "\n",
 "class CRFTrainer(Trainer):\n",
 "    \"\"\"`Trainer` subclass that uses the CRF loss computed inside `AutoModelCRF.forward`.\n",
 "\n",
 "    No `training_step` override is defined: the inherited implementation calls\n",
 "    `compute_loss` and back-propagates the tensor it returns.\n",
 "    \"\"\"\n",
 "\n",
 "    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):\n",
 "        \"\"\"Return the CRF loss of `model` on the batch `inputs`.\n",
 "\n",
 "        `num_items_in_batch` is accepted and ignored; recent `Trainer` versions pass it,\n",
 "        older ones simply omit it. With `return_outputs=True` a `(loss, None)` pair is\n",
 "        returned because the model produces no other output in training mode.\n",
 "        \"\"\"\n",
 "        # Passing the labels makes AutoModelCRF.forward return the negated CRF score.\n",
 "        # Calling it without labels would return decoded tags, which carry no gradient.\n",
 "        loss = model(\n",
 "            input_ids=inputs[\"input_ids\"],\n",
 "            attention_mask=inputs[\"attention_mask\"],\n",
 "            labels=inputs[\"labels\"],\n",
 "        )\n",
 "        return (loss, None) if return_outputs else loss\n",
 "\n",
 "\n",
 "class AutoModelCRF(nn.Module):\n",
 "    \"\"\"Pretrained encoder followed by dropout, a linear emission layer and a CRF over two tags.\"\"\"\n",
 "\n",
 "    def __init__(self, model_name_or_path, dropout=0.075):\n",
 "        \"\"\"Load config and encoder from `model_name_or_path` and build the tagging head.\"\"\"\n",
 "        super().__init__()\n",
 "        self.config = AutoConfig.from_pretrained(model_name_or_path)\n",
 "        self.num_labels = 2  # Semeval_Data emits the labels 0 and 1\n",
 "        # NOTE(review): from_tf=True tells transformers to load TensorFlow weights; kept as in the\n",
 "        # original notebook - confirm the checkpoint repository actually ships them.\n",
 "        self.encoder = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, config=self.config, from_tf=True)\n",
 "        self.dropout = nn.Dropout(dropout)\n",
 "        self.linear = nn.Linear(self.config.hidden_size, self.num_labels)\n",
 "        self.crf = CRF(self.num_labels, batch_first=True)\n",
 "\n",
 "    def forward(self, input_ids, 
attention_mask, labels=None):\n",
 "        \"\"\"Return the CRF loss when `labels` is given, otherwise the decoded tags.\n",
 "\n",
 "        The loss is the negated value returned by the CRF layer. Decoded tags come\n",
 "        back as a NumPy array with one row per sample; rows shorter than the padded\n",
 "        length are extended by repeating their last tag.\n",
 "        \"\"\"\n",
 "        encoder_out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)\n",
 "        emission = self.linear(self.dropout(encoder_out[0]))\n",
 "        crf_mask = attention_mask.byte()\n",
 "        if labels is not None:\n",
 "            return -self.crf(emission, labels, mask=crf_mask)\n",
 "        padded_paths = []\n",
 "        for mask_row, path in zip(attention_mask, self.crf.decode(emission, crf_mask)):\n",
 "            missing = len(mask_row) - len(path)\n",
 "            padded_paths.append(path + [path[-1]] * missing if missing > 0 else path)\n",
 "        return np.array(padded_paths)\n",
 "\n",
 "\n",
 "def evaluate_position_difference(actual_position, predicted_position):\n",
 "    \"\"\"Absolute distance between the two positions.\"\"\"\n",
 "    return abs(actual_position - predicted_position)\n",
 "\n",
 "\n",
 "def get_start_position(sequence, mapping=None, token_level=True):\n",
 "    \"\"\"Locate where `sequence` first switches from 0 to 1.\n",
 "\n",
 "    Entries whose `mapping` value is -100 are dropped first. Without any 0 -> 1 step the\n",
 "    result is 0 when the sequence starts with 1 and the last index otherwise. When\n",
 "    `token_level` is False and a mapping exists, the index is translated through it.\n",
 "    \"\"\"\n",
 "    if mapping is not None:\n",
 "        keep = mapping != -100\n",
 "        sequence, mapping = sequence[keep], mapping[keep]\n",
 "    rises = np.where(np.diff(sequence) == 1)[0]\n",
 "    if len(rises) > 0:\n",
 "        value = rises[0] + 1\n",
 "    elif sequence[0] == 1:\n",
 "        value = 0\n",
 "    else:\n",
 "        value = len(sequence) - 1\n",
 "    if mapping is not None and not token_level:\n",
 "        value = mapping[value]\n",
 "    return value\n",
 "\n",
 "\n",
 "def evaluate_machine_start_position(labels, predictions, idx2word=None, token_level=False):\n",
 "    \"\"\"Mean absolute distance between the labelled and the predicted start position.\n",
 "\n",
 "    Position 0 of every row is skipped and rows are cut at the label length. With\n",
 "    `token_level=False` the positions are translated through `idx2word`, which is then required.\n",
 "    \"\"\"\n",
 "    if idx2word is None and not token_level:\n",
 "        raise ValueError(\"idx2word must be provided if evaluation is at word level (token_level=False)\")\n",
 "    actual_starts, predicted_starts = [], []\n",
 "    for idx in range(labels.shape[0]):\n",
 "        stop = len(labels[idx])\n",
 "        predict = predictions[idx][1:stop]\n",
 "        label = labels[idx][1:stop]\n",
 "        mapping = None if token_level else idx2word[idx][1:stop]\n",
 "        predicted_value = get_start_position(predict, mapping, 
token_level)\n",
 "        actual_value = get_start_position(label, mapping, token_level)\n",
 "        predicted_starts.append(predicted_value)\n",
 "        actual_starts.append(actual_value)\n",
 "    position_differences = [evaluate_position_difference(actual, predict) for actual, predict in zip(actual_starts, predicted_starts)]\n",
 "    mean_position_difference = np.mean(position_differences)\n",
 "    return mean_position_difference\n",
 "\n",
 "\n",
 "def compute_metrics(p):\n",
 "    \"\"\"Unpack a `(predictions, labels)` pair and report the token-level mean absolute start distance.\"\"\"\n",
 "    pred, labels = p\n",
 "    mean_absolute_diff = evaluate_machine_start_position(labels, pred, token_level=True)\n",
 "    return {\"mean_absolute_diff\": mean_absolute_diff,}\n",
 "\n",
 "\n",
 "def training_loop(model, optimizer, train_dataloader, device):\n",
 "    \"\"\"Run one optimisation pass over `train_dataloader`.\n",
 "\n",
 "    Each batch must provide \"input_ids\", \"attention_mask\" and \"labels\". The per-step loss is\n",
 "    logged, the mean loss is printed and also returned (0.0 for an empty loader).\n",
 "    \"\"\"\n",
 "    model.train()\n",
 "    total_loss = 0.0\n",
 "    for step, batch in enumerate(tqdm(train_dataloader)):\n",
 "        input_ids = batch[\"input_ids\"].to(device)\n",
 "        attention_mask = batch[\"attention_mask\"].to(device)\n",
 "        labels = batch[\"labels\"].to(device)\n",
 "        optimizer.zero_grad()\n",
 "        loss = model(input_ids, attention_mask, labels=labels)\n",
 "        loss.backward()\n",
 "        optimizer.step()\n",
 "        step_loss = loss.item()  # read the value once; each .item() call copies it to the host\n",
 "        logger.info(f\"Step {step}: {step_loss:.4f}\")\n",
 "        total_loss += step_loss\n",
 "    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batch.\n",
 "    avg_loss = total_loss / max(len(train_dataloader), 1)\n",
 "    print(f\"Training loss: {avg_loss:.4f}\")\n",
 "    return avg_loss\n",
 "\n",
 "\n",
 "def predict(model, test_dataloader, device):\n",
 "    \"\"\"Collect the model output for every batch of `test_dataloader` into one NumPy array.\n",
 "\n",
 "    The model is switched to eval mode first so dropout cannot perturb the predictions;\n",
 "    it is left in eval mode afterwards (training_loop re-enables train mode itself).\n",
 "    \"\"\"\n",
 "    model.eval()\n",
 "    all_preds = []\n",
 "    with torch.no_grad():\n",
 "        for batch in tqdm(test_dataloader):\n",
 "            input_ids = batch[\"input_ids\"].to(device)\n",
 "            attention_mask = batch[\"attention_mask\"].to(device)\n",
 "            preds = model(input_ids, attention_mask)\n",
 "            all_preds.extend(preds)\n",
 "    out = np.array(all_preds)\n",
 "    print(out.shape)\n",
 "    return out\n",
 "\n",
 "\n",
 "def save_model(model_name, model, optimizer, epoch, output_dir):\n",
 "    \"\"\"Write the model and optimizer state dicts to `<output_dir>/<model_name>-epoch-<epoch>.pt`.\n",
 "\n",
 "    Slashes in `model_name` are replaced by dashes so a hub id can be used as a file name.\n",
 "    \"\"\"\n",
 "    # exist_ok=True replaces the former exists()-then-makedirs() check in a single call.\n",
 "    os.makedirs(output_dir, exist_ok=True)\n",
 "    checkpoint = {'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}\n",
 "    model_name = 
model_name.replace(\"/\", \"-\")\n",
 "    file_path = os.path.join(output_dir, f\"{model_name}-epoch-{epoch}.pt\")\n",
 "    print(file_path)\n",
 "    torch.save(checkpoint, file_path)\n",
 "    logger.info(f\"Model has been saved successfully to {file_path}\")\n",
 "\n",
 "\n",
 "class Semeval_Data(torch.utils.data.Dataset):\n",
 "    \"\"\"JSONL dataset that turns every record into fixed-length token tensors.\n",
 "\n",
 "    Each record needs \"text\" and \"id\". An optional integer \"label\" is the index of the\n",
 "    first space-separated word tagged 1; all earlier words are tagged 0.\n",
 "    \"\"\"\n",
 "\n",
 "    # Ids placed at the start, at the end and in the padding of every sequence.\n",
 "    # NOTE(review): these look like the XLM-RoBERTa ids of <s>, </s> and <pad> - confirm\n",
 "    # before using a different tokenizer.\n",
 "    BOS_ID = 0\n",
 "    EOS_ID = 2\n",
 "    PAD_ID = 1\n",
 "\n",
 "    def __init__(self, data_path, model_name, max_length=512, inference=False, debug=False):\n",
 "        \"\"\"Read the records of `data_path` and load the tokenizer of `model_name`.\"\"\"\n",
 "        # Explicit UTF-8 so parsing does not depend on the platform default encoding.\n",
 "        with open(data_path, \"r\", encoding=\"utf-8\") as f:\n",
 "            # Blank lines (for example a trailing newline) are skipped instead of failing in json.loads.\n",
 "            self.data = [json.loads(line) for line in f if line.strip()]\n",
 "        self.inference = inference\n",
 "        self.tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
 "        self.max_length = max_length\n",
 "        self.debug = debug\n",
 "\n",
 "    def __len__(self):\n",
 "        \"\"\"Number of records.\"\"\"\n",
 "        return len(self.data)\n",
 "\n",
 "    def __getitem__(self, idx):\n",
 "        \"\"\"Encode record `idx`.\n",
 "\n",
 "        Returns a dict with \"input_ids\" and \"attention_mask\" tensors of length `max_length`,\n",
 "        plus \"labels\" when the record has a label. In debug mode (outside inference)\n",
 "        \"partial_human_review\" holds the words before the label index; in inference mode\n",
 "        \"text\", \"id\" and \"corresponding_word\" (word index per token, -100 for added positions) are included.\n",
 "        \"\"\"\n",
 "        record = self.data[idx]\n",
 "        text = record[\"text\"]\n",
 "        words = text.split(\" \")\n",
 "        labels_available = \"label\" in record\n",
 "        label = record[\"label\"] if labels_available else None\n",
 "\n",
 "        labels = []\n",
 "        corresponding_word = []\n",
 "        input_ids = []\n",
 "        for jdx, word in enumerate(words):\n",
 "            word_encoded = self.tokenizer.tokenize(word)\n",
 "            sub_words = len(word_encoded)\n",
 "            if labels_available:\n",
 "                is_machine_text = 1 if jdx >= label else 0\n",
 "                labels.extend([is_machine_text] * sub_words)\n",
 "            corresponding_word.extend([jdx] * sub_words)\n",
 "            input_ids.extend(self.tokenizer.convert_tokens_to_ids(word_encoded))\n",
 "\n",
 "        # Two positions are reserved for the start/end ids. Longer inputs are truncated and\n",
 "        # shorter ones padded, so both cases share the code below.\n",
 "        body_len = self.max_length - 2\n",
 "        input_ids = input_ids[:body_len]\n",
 "        corresponding_word = corresponding_word[:body_len]\n",
 "        n_pad = body_len - len(input_ids)\n",
 "\n",
 "        attention_mask = [1] * (len(input_ids) + 2) + [0] * n_pad\n",
 "        corresponding_word = [-100] + corresponding_word + [-100] * (n_pad + 1)\n",
 "        if labels_available:\n",
 "            labels = labels[:body_len]\n",
 "            # The end/padding positions repeat the last real label; a sample without any\n",
 "            # sub-token has none, so fall back to 0 instead of raising IndexError.\n",
 "            tail_label = labels[-1] if labels else 0\n",
 "            labels = [0] + labels + [tail_label] * (n_pad + 1)\n",
 "        input_ids = [self.BOS_ID] + input_ids + [self.EOS_ID] + [self.PAD_ID] * n_pad\n",
 "\n",
 "        encoded = {}\n",
 "        if labels_available:\n",
 "            encoded[\"labels\"] = torch.tensor(labels)\n",
 "        encoded[\"input_ids\"] = torch.tensor(input_ids)\n",
 "        encoded[\"attention_mask\"] = torch.tensor(attention_mask)\n",
 "        if labels_available:\n",
 "            assert encoded[\"input_ids\"].shape == encoded[\"labels\"].shape\n",
 "        if self.debug and not self.inference:\n",
 "            encoded[\"partial_human_review\"] = \" \".join(words[:label])\n",
 "        if self.inference:\n",
 "            encoded[\"text\"] = text\n",
 "            encoded[\"id\"] = record[\"id\"]\n",
 "            encoded[\"corresponding_word\"] = corresponding_word\n",
 "        return encoded\n",
 "\n",
 "\n",
 "if __name__ == \"__main__\":\n",
 "    model_args = ModelConfig()\n",
 "    data_args = DatasetConfig()\n",
 "    training_args = TrainingArgsConfig()\n",
 "    transformers.set_seed(training_args.seed)\n",
 "    model_path = model_args.model_path\n",
 "    model_checkpoint_dir = model_args.model_checkpoint_dir\n",
 "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
 "    model = AutoModelCRF(model_path).to(device)\n",
 "    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)\n",
 "    train_set = Semeval_Data(data_args.train_file, model_path, max_length=training_args.max_length)\n",
 "    train_dataloader = DataLoader(train_set, batch_size=training_args.per_device_train_batch_size, shuffle=True)\n",
 "    train_eval_dataloader = DataLoader(train_set, batch_size=training_args.per_device_eval_batch_size, shuffle=False)\n",
 "    if training_args.do_train:\n",
 "        logger.info(\"Training...\")\n",
 "        logger.info(\"*** Train Dataset ***\")\n",
 "        logger.info(f\"Number of samples: {len(train_set)}\")\n",
 "        num_train_epochs = training_args.num_train_epochs\n",
 "        for epoch in tqdm(range(num_train_epochs)):\n",
 "            training_loop(model, optimizer, train_dataloader, device)\n",
 "            save_model(model_path, model, optimizer, epoch, model_checkpoint_dir) # ,train_mse ,val_mse" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 301, "referenced_widgets": [ "46b3179893e04e5ab4eb6e0967848b29", "b521ac3bdb5e4c7ea3841ab17058e0d4", "bbdbb4bcd9bb4dd9a07d13eab2e98c8b", "f33364ef9aed41708d0e299f65908c28", "5d57450f4992496cbc1ba1397388e7d0", "ada58e7c96ba4749a87bd7520255fefa", "88055b608df2435182e374b1e487c4ff", "69ed06e493184af08fe9a85b36939914", "4d3cf3fb6a8848efb8bb5a4a113ff95e", "4ad2c43f06fb47e48dd2bfa66cc63480", "2713a4657b0d4e38a33ba96a782e3674", "aec339a98bb44b8faa5f44dd0564fea0", "f86c420a0a134d468c07fd2329c579d3", "d23251854d8b4dddb5aeab7f017b503d", "60a889e56e774a2f9b04a1abeeac5729", "92c1f3a6e5da40bfba6524de99428481", "230b7df7576441a5bbe91022e219327c", "639900294423423aa45c7697704855c7", "297b487addb841c785eb372691163121", "62a7c80684444f88bc1ec110fc99c197", "2f5a2dafd58d45db84cfb73d4c11487e", "e01e1ee0c58945fb82f124ff1537d148", "f381116834034fa3a2ecca34096571e6", "aea561053459432eb93b6761f8260893", "b76009f6e47d473f809bd3f79d04843d", "8b9ef738e7e54c409365ac4407164de0", "e9a64c46f4d5418080b28ffcf7b7e7a6", "8f6b13c8b0f044c7a255c7834e4c5cd9", "9837ccc7a7174fffa25ce2239874fc98", "97d7ba4acf2f487db73d73aeddbca24b", "a2509d0e6f874f029efbc29058ad20bd", "c476b6a4c90b4c35afc776bc2cecef0b", "2bbbafc9bbdf49f8a24db7ac3ce4207d", "cc903da67549457687d7ee2a9db16f4d", "909c9d1828574fedb91055afba368bf4", "4aeeaf4b71a04fd08a3f63f8353ba1c0", "fb955a70314d4ef28c53191d33c49dad", "5ebb1ce43ecf4e609faaad857870f593", "af724ae3920b487690b076f3f9c5b884", "c800ff7792ba4a6b989a9e048a96a1bb", "6a8dae001f194ce5a2318ad71286fa98", "39d616fc55da40be9df3b21f6047ebcb", 
import os

# ---------------------------------------------------------------------------
# Inference cell: restore the fine-tuned checkpoint, run the model over every
# file in `data_args.test_files`, and write one JSON-lines prediction file per
# input file under <training_args.output_dir>/predictions/.
# ---------------------------------------------------------------------------
model = AutoModelCRF(model_args.model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_args.model_path)

# SECURITY: torch.load unpickles the file and can therefore execute arbitrary
# code. Only load checkpoints from a trusted source; consider passing
# weights_only=True once the checkpoint contents are known to be compatible.
# map_location keeps the load working when the checkpoint holds tensors saved
# on a device that is not available in the current runtime.
checkpoint = torch.load('VIE-xlm-longformer', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# One DataLoader per test file, in the same order as `data_args.test_files`.
# shuffle=False matters: predictions and ids are collected in two separate
# passes over the same loader and are later paired up by position.
test_sets = []
for test_file in data_args.test_files:
    test_set = Semeval_Data(
        test_file,
        model_args.model_path,
        max_length=training_args.max_length,
        inference=True,
    )
    test_dataloader = DataLoader(
        test_set,
        batch_size=training_args.per_device_eval_batch_size,
        shuffle=False,
    )
    test_sets.append(test_dataloader)

logger.info("Predicting...")
logger.info("*** Test Datasets ***")
logger.info(f"Number of sets: {len(test_sets)}")

# Each loader is paired with the file it was built from, so the predictions of
# every test set are written under that file's own name (and not under
# whatever `test_file` happened to be when the loading loop finished).
for idx, (test_file, test_set) in enumerate(zip(data_args.test_files, test_sets)):
    logger.info(f"Test Dataset {idx + 1}")
    # len(DataLoader) is the number of batches; the number of samples is the
    # length of the dataset the loader wraps.
    logger.info(f"Number of samples: {len(test_set.dataset)}")
    predictions = predict(model, test_set, device)

    # Second pass over the loader: collect each sample's id and its
    # `corresponding_word` sequence in iteration order.
    corresponding_words = []
    ids = []
    for batch in test_set:
        # as_tensor accepts tensors and plain sequences alike and, unlike
        # torch.tensor(tensor), does not emit a copy-construct UserWarning.
        corr_word_tensors = [torch.as_tensor(cw) for cw in batch['corresponding_word']]
        corr_word_padded = torch.nn.utils.rnn.pad_sequence(
            corr_word_tensors, batch_first=True, padding_value=-100
        )
        # NOTE(review): the (1, 0) transpose suggests the collated field is
        # laid out position-major (one entry per token position) and is
        # flipped here to one row per sample -- confirm against the collate
        # behaviour of Semeval_Data.
        corr_word = np.transpose(corr_word_padded.numpy(), (1, 0))
        ids.extend(batch["id"])
        corresponding_words.extend(corr_word)
    corresponding_words = np.array(corresponding_words)
    logger.info("Predictions completed!")

    # Turn each prediction into a single label via get_start_position; zip
    # stops at the shortest of the three sequences.
    df_ids = []
    df_labels = []
    for sample_id, pred, corr_word in zip(ids, predictions, corresponding_words):
        df_ids.append(sample_id)
        df_labels.append(get_start_position(pred, corr_word, token_level=False))
    df = pd.DataFrame({"id": df_ids, "label": df_labels})

    # Output file keeps the base name of the test file it was produced from.
    file_name = os.path.basename(test_file)
    file_dirs = os.path.join(training_args.output_dir, "predictions")
    os.makedirs(file_dirs, exist_ok=True)
    file_path = os.path.join(file_dirs, file_name)

    # Going through DataFrame.to_dict("records") before json.dumps is kept
    # from the original flow; one JSON object is written per line.
    records = df.to_dict("records")
    with open(file_path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record) + "\n")
Please update your code to provide a seed to the initializer, or avoid using the same initializer instance more than once.\n", " warnings.warn(\n", "All TF 2.0 model weights were used when initializing LongformerModel.\n", "\n", "All the weights of LongformerModel were initialized from the TF 2.0 model.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use LongformerModel for predictions without further training.\n", ":4: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", " checkpoint = torch.load('VIE-xlm-longformer')\n", " 0%| | 0/4164 [00:00:22: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " corr_word_tensors = [torch.tensor(cw) for cw in batch['corresponding_word']]\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install jsonlines\n", "import pandas as pd\n", "import jsonlines\n", "jsonl_file_path = '/content/runs/exp_seed/predictions/VIE_test.jsonl'\n", "def display_jsonl_as_dataframe(file_path):\n", " data = []\n", " with jsonlines.open(file_path) as reader:\n", " for obj in reader:\n", " data.append(obj)\n", " df = pd.DataFrame(data)\n", " return df\n", "jsonl_df = display_jsonl_as_dataframe(jsonl_file_path)\n", "jsonl_df" ], "metadata": { "id": "yutpCG-Drcjn", "colab": { "base_uri": "https://localhost:8080/", "height": 527 }, "outputId": "d92be248-b4c6-49db-b8a8-7621b1a33562" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting jsonlines\n", " Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)\n", "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.11/dist-packages (from jsonlines) (25.1.0)\n", "Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", "Installing collected packages: jsonlines\n", "Successfully installed jsonlines-4.0.0\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " id label\n", "0 VIE2 0\n", "1 VIE4 0\n", "2 VIE5 201\n", "3 VIE7 789\n", "4 VIE11 0\n", "... ... ...\n", "49958 VIE99923 20\n", "49959 VIE99924 39\n", "49960 VIE99925 36\n", "49961 VIE99927 431\n", "49962 VIE99928 140\n", "\n", "[49963 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel
0VIE20
1VIE40
2VIE5201
3VIE7789
4VIE110
.........
49958VIE9992320
49959VIE9992439
49960VIE9992536
49961VIE99927431
49962VIE99928140
\n", "

49963 rows × 2 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
# Load the gold test file with the `display_jsonl_as_dataframe` helper that
# the previous cell already defines; redefining the same function here would
# only shadow it.
jsonl_file_path = '/content/VIE_test.jsonl'
jsonl_df_gold = display_jsonl_as_dataframe(jsonl_file_path)

# The comparison cell that follows selects these columns; fail here with a
# clear message if the gold file does not provide them.
_required_gold_columns = {"id", "text", "label"}
assert _required_gold_columns.issubset(jsonl_df_gold.columns), (
    f"gold file is missing columns: {_required_gold_columns - set(jsonl_df_gold.columns)}"
)

jsonl_df_gold
93 \n", "1 Tại Trường tiểu học Lê Đình Chinh (Q.11), bên ... 87 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 400 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 1544 \n", "4 - Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ... 106 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 67 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 148 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 86 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 827 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " Original Char Count label \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " text New Word Count \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 562 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " New Char Count id \n", "0 292 VIE2 \n", "1 407 VIE4 \n", "2 1264 VIE5 \n", "3 3916 VIE7 \n", "4 369 VIE11 \n", "... ... ... \n", "49958 298 VIE99923 \n", "49959 695 VIE99924 \n", "49960 657 VIE99925 \n", "49961 2535 VIE99927 \n", "49962 1116 VIE99928 \n", "\n", "[49963 rows x 12 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountlabeltextNew Word CountNew Char Countid
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...68292VIE2
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...90407VIE4
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...2821264VIE5
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...8473916VIE7
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83369VIE11
.......................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...69298VIE99923
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...153695VIE99924
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...144657VIE99925
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...5622535VIE99927
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...2081116VIE99928
\n", "

49963 rows × 12 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "jsonl_df_gold", "summary": "{\n \"name\": \"jsonl_df_gold\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"VIE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Gemini-Pro-1.5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Rewritten\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49963,\n \"samples\": [\n \"B\\u1ea1n l\\u00e0 \\u1ee9ng vi\\u00ean ch\\u01b0a c\\u00f3 kinh nghi\\u1ec7m l\\u00e0m vi\\u1ec7c cho n\\u00ean nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng th\\u01b0\\u1eddng h\\u1ecfi nh\\u1eefng ki\\u1ebfn th\\u1ee9c b\\u1ea1n \\u0111\\u00e3 \\u0111\\u01b0\\u1ee3c \\u0111\\u00e0o t\\u1ea1o v\\u00e0 s\\u1ef1 v\\u1eadn d\\u1ee5ng c\\u00e1c ki\\u1ebfn th\\u1ee9c \\u0111\\u00f3 v\\u00e0o trong th\\u1ef1c t\\u1ebf th\\u00f4ng qua c\\u00e1c t\\u00ecnh hu\\u1ed1ng th\\u1ef1c t\\u1ebf. B\\u00ean c\\u1ea1nh \\u0111\\u00f3, nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng s\\u1ebd ph\\u1ecfng v\\u1ea5n b\\u1ea1n \\u0111\\u1ec3 th\\u1ea5y \\u0111\\u01b0\\u1ee3c nh\\u1eefng kh\\u1ea3 n\\u0103ng \\u1ee9ng ph\\u00f3 c\\u1ee7a b\\u1ea1n th\\u00f4ng qua c\\u00e1c c\\u00e2u h\\u1ecfi. 
N\\u00f3 th\\u1ec3 hi\\u1ec7n t\\u01b0 duy s\\u00e1ng t\\u1ea1o v\\u00e0 th\\u00f4ng minh trong khi tr\\u1ea3 l\\u1eddi c\\u1ee7a b\\u1ea1n.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 707,\n \"min\": 7,\n \"max\": 21413,\n \"num_unique_values\": 2810,\n \"samples\": [\n 1814\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3224,\n \"min\": 33,\n \"max\": 95406,\n \"num_unique_values\": 7520,\n \"samples\": [\n 9869\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
# Give the two `label` columns distinct names so they can sit side by side
# after the join. rename() returns a new frame; the names are rebound exactly
# as before so later cells see the same variables.
jsonl_df = jsonl_df.rename(columns={'label': 'label_pred'})
jsonl_df_gold = jsonl_df_gold.rename(columns={'label': 'label_gold'})

# Join predictions to gold rows on the sample id.
# validate="one_to_one" makes pandas raise if an id occurs more than once on
# either side, which would otherwise silently multiply rows in the result.
merged_df = pd.merge(
    jsonl_df[['id', 'label_pred']],
    jsonl_df_gold[['id', 'text', 'label_gold']],
    on='id',
    how='inner',
    validate='one_to_one',
)

# An inner join drops ids that have no partner; a prediction without a gold
# row means the two files do not describe the same test set.
assert len(merged_df) == len(jsonl_df), (
    f"{len(jsonl_df) - len(merged_df)} prediction id(s) have no matching gold row"
)

merged_df
210 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 789 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 0 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 20 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 65 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 43 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 460 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", "[49963 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel_predtextlabel_gold
0VIE20Theo thông tin cập nhật hôm sau, bị cáo L.T đã...0
1VIE40Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...0
2VIE5201Đây là vụ tấn công thứ hai trong vòng hai tuần...210
3VIE7789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...789
4VIE110Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...0
...............
49958VIE9992320đối đãi rất tốt và không hề có mâu thuẫn trước...20
49959VIE9992439Từ biên giới Trung Quốc, mất chừng ba giờ lái ...65
49960VIE9992536Được biết, hiện nay dòng sản phẩm váy cưới dàn...43
49961VIE99927431Bà Haley: Hội đồng bảo vệ những nước vi phạm n...460
49962VIE99928140Lời cáo buộc được đưa ra giữa lúc quan h...208
\n", "

49963 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49963,\n \"samples\": [\n \"VIE25022\",\n \"VIE45113\",\n \"VIE92676\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263,\n \"min\": 0,\n \"max\": 1887,\n \"num_unique_values\": 1621,\n \"samples\": [\n 123,\n 1487,\n 1362\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. 
Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\",\n \"H\\u1ed3ng y George Pell V\\u1ecb H\\u1ed3ng y c\\u1ea5p cao nh\\u1ea5t c\\u1ee7a Gi\\u00e1o h\\u00f4\\u0323i C\\u00f4ng gi\\u00e1o Australia n\\u00f3i \\u00f4ng \\u0111\\u00e3 b\\u1ecb 'th\\u00f3a m\\u1ea1 t\\u00ednh c\\u00e1ch' trong su\\u1ed1t cu\\u00f4\\u0323c \\u0111i\\u1ec1u tra k\\u00e9o d\\u00e0i hai n\\u0103m v\\u1ec1 nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c 'gi\\u1ea3' n\\u00e0y. \\u00d4ng cho bi\\u1ebft \\u0110\\u1ee9c Gi\\u00e1o ho\\u00e0ng \\u0111\\u00e3 cho ph\\u00e9p \\u00f4ng ngh\\u1ec9 \\u0111\\u1ec3 \\u0111\\u01b0\\u01a1ng \\u0111\\u1ea7u v\\u1edbi nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c n\\u00e0y. 
B\\u1ea3n \\u00e1n c\\u00f3 li\\u00ean quan t\\u1edbi nh\\u1eefng v\\u1ee5 vi\\u00ea\\u0323c \\u0111\\u01b0\\u1ee3c cho l\\u00e0 'c\\u00f3 t\\u00ednh l\\u1ecbch s\\u1eed', c\\u1ea3nh s\\u00e1t bang Victoria cho bi\\u1ebft. Vatican lo ng\\u1ea1i v\\u1ec1 gi\\u00e1m m\\u1ee5c b\\u1ecb giam t\\u1ea1i TQ Th\\u1ec9nh c\\u1ea7u Vatican 'minh x\\u00e9t' v\\u1ec1 c\\u1ef1u TGM Ki\\u1ec7t H\\u1ed3ng y Pell, n\\u0103m nay 76 tu\\u1ed5i, \\u0111ang l\\u00e0m vi\\u00ea\\u0323c t\\u1ea1i Vatican, v\\u00e0 \\u0111\\u01b0\\u1ee3c coi l\\u00e0 ch\\u1ee9c s\\u1eafc cao c\\u1ea5p th\\u1ee9 ba \\u1edf Vatican. Ph\\u00e1t bi\\u1ec3u t\\u1ea1i m\\u00f4\\u0323t cu\\u00f4\\u0323c h\\u1ecdp b\\u00e1o, H\\u1ed3ng y Pell cho bi\\u1ebft \\u00f4ng s\\u1ebd bay v\\u1ec1 Australia n\\u1ebfu c\\u00e1c b\\u00e1c s\\u0129 cho ph\\u00e9p. 'T\\u00f4i mong \\u0111\\u1ebfn ng\\u00e0y t\\u00f4i ra t\\u00f2a,' \\u00f4ng n\\u00f3i. 'T\\u00f4i kh\\u00f4ng ph\\u1ea1m nh\\u1eefng t\\u00f4\\u0323i n\\u00e0y, ch\\u00fang l\\u00e0 gi\\u1ea3. Chuy\\u00ea\\u0323n l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c l\\u00e0 gh\\u00ea t\\u1edfm \\u0111\\u1ed1i v\\u1edbi t\\u00f4i.' Gi\\u00e1o ho\\u00e0ng Francis n\\u1ed7 l\\u1ef1c c\\u1ea3i t\\u1ed5 Vatican T\\u00f2a th\\u00e1nh C\\u00f4ng gi\\u00e1o tr\\u00ean kh\\u1eafp th\\u1ebf gi\\u1edbi \\u0111\\u00e3 ph\\u1ea3i \\u0111\\u1ed1i m\\u0103\\u0323t v\\u1edbi h\\u00e0ng lo\\u1ea1t c\\u00e1o bu\\u00f4\\u0323c li\\u00ean quan \\u0111\\u1ebfn l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c do c\\u00e1c th\\u1ea7y tu g\\u00e2y ra. C\\u00f3 ngu\\u1ed3n tin n\\u00f3i r\\u1eb1ng nh\\u1eefng tr\\u01b0\\u1eddng h\\u1ee3p n\\u00e0y b\\u1ecb che \\u0111\\u00e2\\u0323y. Ph\\u00f3ng vi\\u00ean James Reynolds c\\u1ee7a BBC n\\u00f3i nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c n\\u00e0y \\u0111\\u01b0a T\\u00f2a Th\\u00e1nh v\\u00e0o m\\u00f4\\u0323t th\\u1eddi \\u0111i\\u1ec3m kh\\u00f3 kh\\u0103n. 
\\u0110\\u1ee9c Gi\\u00e1o ho\\u00e0ng Francis \\u0111ang n\\u1ed7 l\\u1ef1c c\\u1ea3i t\\u1ed5 Vatican v\\u00e0 n\\u00e2ng cao ti\\u00eau chu\\u1ea9n \\u0111\\u1ea1o \\u0111\\u1ee9c. Tuy nhi\\u00ean, v\\u1ee5 \\u00e1n c\\u1ee7a H\\u1ed3ng y Pell cho th\\u1ea5y nh\\u1eefng th\\u00e1ch th\\u1ee9c to l\\u1edbn m\\u00e0 Gi\\u00e1o h\\u1ed9i C\\u00f4ng gi\\u00e1o ph\\u1ea3i \\u0111\\u1ed1i m\\u1eb7t trong vi\\u1ec7c gi\\u1ea3i quy\\u1ebft v\\u1ea5n \\u0111\\u1ec1 l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c v\\u00e0 gi\\u00e0nh l\\u1ea1i l\\u00f2ng tin c\\u1ee7a c\\u00f4ng ch\\u00fang. Vi\\u1ec7c H\\u1ed3ng y Pell kh\\u1eb3ng \\u0111\\u1ecbnh s\\u1ef1 trong s\\u1ea1ch c\\u1ee7a m\\u00ecnh v\\u00e0 s\\u1eb5n s\\u00e0ng \\u0111\\u1ed1i m\\u1eb7t v\\u1edbi t\\u00f2a \\u00e1n cho th\\u1ea5y \\u00f4ng t\\u1ef1 tin v\\u00e0o s\\u1ef1 v\\u00f4 t\\u1ed9i c\\u1ee7a m\\u00ecnh. Tuy nhi\\u00ean, s\\u1ef1 vi\\u1ec7c c\\u0169ng \\u0111\\u1eb7t ra c\\u00e2u h\\u1ecfi v\\u1ec1 t\\u00ednh minh b\\u1ea1ch v\\u00e0 hi\\u1ec7u qu\\u1ea3 c\\u1ee7a c\\u00e1c cu\\u1ed9c \\u0111i\\u1ec1u tra n\\u1ed9i b\\u1ed9 trong Gi\\u00e1o h\\u1ed9i. T\\u01b0\\u01a1ng lai\",\n \"\\u201cCh\\u00e0o s\\u00e2n\\u201d b\\u1eb1ng ca kh\\u00fac c\\u00f3 giai \\u0111i\\u1ec7u nh\\u1eb9 nh\\u00e0ng \\u201cY\\u00eau th\\u01b0\\u01a1ng mong manh\\u201d, c\\u1eb7p Qu\\u00e1ch Ng\\u1ecdc Ngoan \\u2013 Ng\\u1ecdc Anh th\\u00eam m\\u1ed9t l\\u1ea7n n\\u1eefa kh\\u1eb3ng \\u0111\\u1ecbnh r\\u1eb1ng nh\\u1ea1c tr\\u1eef t\\u00ecnh m\\u1edbi l\\u00e0 s\\u1edf tr\\u01b0\\u1eddng c\\u1ee7a h\\u1ecd. Gi\\u1ecdng h\\u00e1t ng\\u1ecdt ng\\u00e0o, t\\u00ecnh c\\u1ea3m c\\u1ee7a Ng\\u1ecdc Anh h\\u00f2a quy\\u1ec7n c\\u00f9ng ch\\u1ea5t gi\\u1ecdng tr\\u1ea7m \\u1ea5m, nam t\\u00ednh c\\u1ee7a Qu\\u00e1ch Ng\\u1ecdc Ngoan \\u0111\\u00e3 t\\u1ea1o n\\u00ean m\\u1ed9t m\\u00e0n tr\\u00ecnh di\\u1ec5n \\u0111\\u1ea7y c\\u1ea3m x\\u00fac, ch\\u1ea1m \\u0111\\u1ebfn tr\\u00e1i tim kh\\u00e1n gi\\u1ea3. 
S\\u1ef1 k\\u1ebft h\\u1ee3p \\u0103n \\u00fd n\\u00e0y kh\\u00f4ng ch\\u1ec9 d\\u1eebng l\\u1ea1i \\u1edf gi\\u1ecdng h\\u00e1t m\\u00e0 c\\u00f2n th\\u1ec3 hi\\u1ec7n qua \\u00e1nh m\\u1eaft, c\\u1eed ch\\u1ec9 t\\u00ecnh t\\u1ee9 tr\\u00ean s\\u00e2n kh\\u1ea5u. D\\u01b0\\u1eddng nh\\u01b0 h\\u1ecd \\u0111ang k\\u1ec3 m\\u1ed9t c\\u00e2u chuy\\u1ec7n t\\u00ecnh y\\u00eau \\u0111\\u1ea7y l\\u00e3ng m\\u1ea1n, nh\\u1eb9 nh\\u00e0ng nh\\u01b0ng c\\u0169ng kh\\u00f4ng k\\u00e9m ph\\u1ea7n s\\u00e2u l\\u1eafng. Ph\\u1ea7n h\\u00f2a \\u00e2m ph\\u1ed1i kh\\u00ed tinh t\\u1ebf c\\u00e0ng l\\u00e0m n\\u1ed5i b\\u1eadt l\\u00ean v\\u1ebb \\u0111\\u1eb9p c\\u1ee7a ca kh\\u00fac, \\u0111\\u01b0a ng\\u01b0\\u1eddi nghe ch\\u00ecm \\u0111\\u1eafm v\\u00e0o kh\\u00f4ng gian \\u00e2m nh\\u1ea1c tr\\u1eef t\\u00ecnh \\u0111\\u1ea7y m\\u00ea ho\\u1eb7c. S\\u1eafp t\\u1edbi,\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860,\n 955,\n 244\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 17 } ] }, { "cell_type": "code", "source": [ "merged_df['diff'] = (merged_df['label_pred'] - merged_df['label_gold']).abs()\n", "merged_df" ], "metadata": { "id": "Lh8HQBtIrvFx", "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "outputId": "da599ab2-ead6-49ba-c67c-61a1d7195d93" }, "execution_count": 18, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id label_pred \\\n", "0 VIE2 0 \n", "1 VIE4 0 \n", "2 VIE5 201 \n", "3 VIE7 789 \n", "4 VIE11 0 \n", "... ... ... \n", "49958 VIE99923 20 \n", "49959 VIE99924 39 \n", "49960 VIE99925 36 \n", "49961 VIE99927 431 \n", "49962 VIE99928 140 \n", "\n", " text label_gold diff \n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 
0 0 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 0 0 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 210 9 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 789 0 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 0 0 \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 20 0 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 65 26 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 43 7 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 460 29 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 68 \n", "\n", "[49963 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel_predtextlabel_golddiff
0VIE20Theo thông tin cập nhật hôm sau, bị cáo L.T đã...00
1VIE40Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...00
2VIE5201Đây là vụ tấn công thứ hai trong vòng hai tuần...2109
3VIE7789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...7890
4VIE110Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...00
..................
49958VIE9992320đối đãi rất tốt và không hề có mâu thuẫn trước...200
49959VIE9992439Từ biên giới Trung Quốc, mất chừng ba giờ lái ...6526
49960VIE9992536Được biết, hiện nay dòng sản phẩm váy cưới dàn...437
49961VIE99927431Bà Haley: Hội đồng bảo vệ những nước vi phạm n...46029
49962VIE99928140Lời cáo buộc được đưa ra giữa lúc quan h...20868
\n", "

49963 rows × 5 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49963,\n \"samples\": [\n \"VIE25022\",\n \"VIE45113\",\n \"VIE92676\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263,\n \"min\": 0,\n \"max\": 1887,\n \"num_unique_values\": 1621,\n \"samples\": [\n 123,\n 1487,\n 1362\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. 
Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\",\n \"H\\u1ed3ng y George Pell V\\u1ecb H\\u1ed3ng y c\\u1ea5p cao nh\\u1ea5t c\\u1ee7a Gi\\u00e1o h\\u00f4\\u0323i C\\u00f4ng gi\\u00e1o Australia n\\u00f3i \\u00f4ng \\u0111\\u00e3 b\\u1ecb 'th\\u00f3a m\\u1ea1 t\\u00ednh c\\u00e1ch' trong su\\u1ed1t cu\\u00f4\\u0323c \\u0111i\\u1ec1u tra k\\u00e9o d\\u00e0i hai n\\u0103m v\\u1ec1 nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c 'gi\\u1ea3' n\\u00e0y. \\u00d4ng cho bi\\u1ebft \\u0110\\u1ee9c Gi\\u00e1o ho\\u00e0ng \\u0111\\u00e3 cho ph\\u00e9p \\u00f4ng ngh\\u1ec9 \\u0111\\u1ec3 \\u0111\\u01b0\\u01a1ng \\u0111\\u1ea7u v\\u1edbi nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c n\\u00e0y. 
B\\u1ea3n \\u00e1n c\\u00f3 li\\u00ean quan t\\u1edbi nh\\u1eefng v\\u1ee5 vi\\u00ea\\u0323c \\u0111\\u01b0\\u1ee3c cho l\\u00e0 'c\\u00f3 t\\u00ednh l\\u1ecbch s\\u1eed', c\\u1ea3nh s\\u00e1t bang Victoria cho bi\\u1ebft. Vatican lo ng\\u1ea1i v\\u1ec1 gi\\u00e1m m\\u1ee5c b\\u1ecb giam t\\u1ea1i TQ Th\\u1ec9nh c\\u1ea7u Vatican 'minh x\\u00e9t' v\\u1ec1 c\\u1ef1u TGM Ki\\u1ec7t H\\u1ed3ng y Pell, n\\u0103m nay 76 tu\\u1ed5i, \\u0111ang l\\u00e0m vi\\u00ea\\u0323c t\\u1ea1i Vatican, v\\u00e0 \\u0111\\u01b0\\u1ee3c coi l\\u00e0 ch\\u1ee9c s\\u1eafc cao c\\u1ea5p th\\u1ee9 ba \\u1edf Vatican. Ph\\u00e1t bi\\u1ec3u t\\u1ea1i m\\u00f4\\u0323t cu\\u00f4\\u0323c h\\u1ecdp b\\u00e1o, H\\u1ed3ng y Pell cho bi\\u1ebft \\u00f4ng s\\u1ebd bay v\\u1ec1 Australia n\\u1ebfu c\\u00e1c b\\u00e1c s\\u0129 cho ph\\u00e9p. 'T\\u00f4i mong \\u0111\\u1ebfn ng\\u00e0y t\\u00f4i ra t\\u00f2a,' \\u00f4ng n\\u00f3i. 'T\\u00f4i kh\\u00f4ng ph\\u1ea1m nh\\u1eefng t\\u00f4\\u0323i n\\u00e0y, ch\\u00fang l\\u00e0 gi\\u1ea3. Chuy\\u00ea\\u0323n l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c l\\u00e0 gh\\u00ea t\\u1edfm \\u0111\\u1ed1i v\\u1edbi t\\u00f4i.' Gi\\u00e1o ho\\u00e0ng Francis n\\u1ed7 l\\u1ef1c c\\u1ea3i t\\u1ed5 Vatican T\\u00f2a th\\u00e1nh C\\u00f4ng gi\\u00e1o tr\\u00ean kh\\u1eafp th\\u1ebf gi\\u1edbi \\u0111\\u00e3 ph\\u1ea3i \\u0111\\u1ed1i m\\u0103\\u0323t v\\u1edbi h\\u00e0ng lo\\u1ea1t c\\u00e1o bu\\u00f4\\u0323c li\\u00ean quan \\u0111\\u1ebfn l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c do c\\u00e1c th\\u1ea7y tu g\\u00e2y ra. C\\u00f3 ngu\\u1ed3n tin n\\u00f3i r\\u1eb1ng nh\\u1eefng tr\\u01b0\\u1eddng h\\u1ee3p n\\u00e0y b\\u1ecb che \\u0111\\u00e2\\u0323y. Ph\\u00f3ng vi\\u00ean James Reynolds c\\u1ee7a BBC n\\u00f3i nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c n\\u00e0y \\u0111\\u01b0a T\\u00f2a Th\\u00e1nh v\\u00e0o m\\u00f4\\u0323t th\\u1eddi \\u0111i\\u1ec3m kh\\u00f3 kh\\u0103n. 
\\u0110\\u1ee9c Gi\\u00e1o ho\\u00e0ng Francis \\u0111ang n\\u1ed7 l\\u1ef1c c\\u1ea3i t\\u1ed5 Vatican v\\u00e0 n\\u00e2ng cao ti\\u00eau chu\\u1ea9n \\u0111\\u1ea1o \\u0111\\u1ee9c. Tuy nhi\\u00ean, v\\u1ee5 \\u00e1n c\\u1ee7a H\\u1ed3ng y Pell cho th\\u1ea5y nh\\u1eefng th\\u00e1ch th\\u1ee9c to l\\u1edbn m\\u00e0 Gi\\u00e1o h\\u1ed9i C\\u00f4ng gi\\u00e1o ph\\u1ea3i \\u0111\\u1ed1i m\\u1eb7t trong vi\\u1ec7c gi\\u1ea3i quy\\u1ebft v\\u1ea5n \\u0111\\u1ec1 l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c v\\u00e0 gi\\u00e0nh l\\u1ea1i l\\u00f2ng tin c\\u1ee7a c\\u00f4ng ch\\u00fang. Vi\\u1ec7c H\\u1ed3ng y Pell kh\\u1eb3ng \\u0111\\u1ecbnh s\\u1ef1 trong s\\u1ea1ch c\\u1ee7a m\\u00ecnh v\\u00e0 s\\u1eb5n s\\u00e0ng \\u0111\\u1ed1i m\\u1eb7t v\\u1edbi t\\u00f2a \\u00e1n cho th\\u1ea5y \\u00f4ng t\\u1ef1 tin v\\u00e0o s\\u1ef1 v\\u00f4 t\\u1ed9i c\\u1ee7a m\\u00ecnh. Tuy nhi\\u00ean, s\\u1ef1 vi\\u1ec7c c\\u0169ng \\u0111\\u1eb7t ra c\\u00e2u h\\u1ecfi v\\u1ec1 t\\u00ednh minh b\\u1ea1ch v\\u00e0 hi\\u1ec7u qu\\u1ea3 c\\u1ee7a c\\u00e1c cu\\u1ed9c \\u0111i\\u1ec1u tra n\\u1ed9i b\\u1ed9 trong Gi\\u00e1o h\\u1ed9i. T\\u01b0\\u01a1ng lai\",\n \"\\u201cCh\\u00e0o s\\u00e2n\\u201d b\\u1eb1ng ca kh\\u00fac c\\u00f3 giai \\u0111i\\u1ec7u nh\\u1eb9 nh\\u00e0ng \\u201cY\\u00eau th\\u01b0\\u01a1ng mong manh\\u201d, c\\u1eb7p Qu\\u00e1ch Ng\\u1ecdc Ngoan \\u2013 Ng\\u1ecdc Anh th\\u00eam m\\u1ed9t l\\u1ea7n n\\u1eefa kh\\u1eb3ng \\u0111\\u1ecbnh r\\u1eb1ng nh\\u1ea1c tr\\u1eef t\\u00ecnh m\\u1edbi l\\u00e0 s\\u1edf tr\\u01b0\\u1eddng c\\u1ee7a h\\u1ecd. Gi\\u1ecdng h\\u00e1t ng\\u1ecdt ng\\u00e0o, t\\u00ecnh c\\u1ea3m c\\u1ee7a Ng\\u1ecdc Anh h\\u00f2a quy\\u1ec7n c\\u00f9ng ch\\u1ea5t gi\\u1ecdng tr\\u1ea7m \\u1ea5m, nam t\\u00ednh c\\u1ee7a Qu\\u00e1ch Ng\\u1ecdc Ngoan \\u0111\\u00e3 t\\u1ea1o n\\u00ean m\\u1ed9t m\\u00e0n tr\\u00ecnh di\\u1ec5n \\u0111\\u1ea7y c\\u1ea3m x\\u00fac, ch\\u1ea1m \\u0111\\u1ebfn tr\\u00e1i tim kh\\u00e1n gi\\u1ea3. 
S\\u1ef1 k\\u1ebft h\\u1ee3p \\u0103n \\u00fd n\\u00e0y kh\\u00f4ng ch\\u1ec9 d\\u1eebng l\\u1ea1i \\u1edf gi\\u1ecdng h\\u00e1t m\\u00e0 c\\u00f2n th\\u1ec3 hi\\u1ec7n qua \\u00e1nh m\\u1eaft, c\\u1eed ch\\u1ec9 t\\u00ecnh t\\u1ee9 tr\\u00ean s\\u00e2n kh\\u1ea5u. D\\u01b0\\u1eddng nh\\u01b0 h\\u1ecd \\u0111ang k\\u1ec3 m\\u1ed9t c\\u00e2u chuy\\u1ec7n t\\u00ecnh y\\u00eau \\u0111\\u1ea7y l\\u00e3ng m\\u1ea1n, nh\\u1eb9 nh\\u00e0ng nh\\u01b0ng c\\u0169ng kh\\u00f4ng k\\u00e9m ph\\u1ea7n s\\u00e2u l\\u1eafng. Ph\\u1ea7n h\\u00f2a \\u00e2m ph\\u1ed1i kh\\u00ed tinh t\\u1ebf c\\u00e0ng l\\u00e0m n\\u1ed5i b\\u1eadt l\\u00ean v\\u1ebb \\u0111\\u1eb9p c\\u1ee7a ca kh\\u00fac, \\u0111\\u01b0a ng\\u01b0\\u1eddi nghe ch\\u00ecm \\u0111\\u1eafm v\\u00e0o kh\\u00f4ng gian \\u00e2m nh\\u1ea1c tr\\u1eef t\\u00ecnh \\u0111\\u1ea7y m\\u00ea ho\\u1eb7c. S\\u1eafp t\\u1edbi,\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860,\n 955,\n 244\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 244,\n \"min\": 0,\n \"max\": 16568,\n \"num_unique_values\": 832,\n \"samples\": [\n 1584,\n 1041,\n 646\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "merged_df['id'] = merged_df['id'].str[3:].astype(int)\n", "merged_df" ], "metadata": { "id": "zZf3ctI2rwvS", "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "outputId": "fa6d7cbd-d616-4f23-8821-37772111ddd9" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id label_pred text \\\n", "0 2 0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 
\n", "1 4 0 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... \n", "2 5 201 Đây là vụ tấn công thứ hai trong vòng hai tuần... \n", "3 7 789 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... \n", "4 11 0 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... \n", "... ... ... ... \n", "49958 99923 20 đối đãi rất tốt và không hề có mâu thuẫn trước... \n", "49959 99924 39 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... \n", "49960 99925 36 Được biết, hiện nay dòng sản phẩm váy cưới dàn... \n", "49961 99927 431 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... \n", "49962 99928 140 Lời cáo buộc được đưa ra giữa lúc quan h... \n", "\n", " label_gold diff \n", "0 0 0 \n", "1 0 0 \n", "2 210 9 \n", "3 789 0 \n", "4 0 0 \n", "... ... ... \n", "49958 20 0 \n", "49959 65 26 \n", "49960 43 7 \n", "49961 460 29 \n", "49962 208 68 \n", "\n", "[49963 rows x 5 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idlabel_predtextlabel_golddiff
020Theo thông tin cập nhật hôm sau, bị cáo L.T đã...00
140Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...00
25201Đây là vụ tấn công thứ hai trong vòng hai tuần...2109
37789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...7890
4110Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...00
..................
499589992320đối đãi rất tốt và không hề có mâu thuẫn trước...200
499599992439Từ biên giới Trung Quốc, mất chừng ba giờ lái ...6526
499609992536Được biết, hiện nay dòng sản phẩm váy cưới dàn...437
4996199927431Bà Haley: Hội đồng bảo vệ những nước vi phạm n...46029
4996299928140Lời cáo buộc được đưa ra giữa lúc quan h...20868
\n", "

49963 rows × 5 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28749,\n \"min\": 2,\n \"max\": 99928,\n \"num_unique_values\": 49963,\n \"samples\": [\n 25022,\n 45113,\n 92676\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263,\n \"min\": 0,\n \"max\": 1887,\n \"num_unique_values\": 1621,\n \"samples\": [\n 123,\n 1487,\n 1362\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. 
Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\",\n \"H\\u1ed3ng y George Pell V\\u1ecb H\\u1ed3ng y c\\u1ea5p cao nh\\u1ea5t c\\u1ee7a Gi\\u00e1o h\\u00f4\\u0323i C\\u00f4ng gi\\u00e1o Australia n\\u00f3i \\u00f4ng \\u0111\\u00e3 b\\u1ecb 'th\\u00f3a m\\u1ea1 t\\u00ednh c\\u00e1ch' trong su\\u1ed1t cu\\u00f4\\u0323c \\u0111i\\u1ec1u tra k\\u00e9o d\\u00e0i hai n\\u0103m v\\u1ec1 nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c 'gi\\u1ea3' n\\u00e0y. \\u00d4ng cho bi\\u1ebft \\u0110\\u1ee9c Gi\\u00e1o ho\\u00e0ng \\u0111\\u00e3 cho ph\\u00e9p \\u00f4ng ngh\\u1ec9 \\u0111\\u1ec3 \\u0111\\u01b0\\u01a1ng \\u0111\\u1ea7u v\\u1edbi nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c n\\u00e0y. 
B\\u1ea3n \\u00e1n c\\u00f3 li\\u00ean quan t\\u1edbi nh\\u1eefng v\\u1ee5 vi\\u00ea\\u0323c \\u0111\\u01b0\\u1ee3c cho l\\u00e0 'c\\u00f3 t\\u00ednh l\\u1ecbch s\\u1eed', c\\u1ea3nh s\\u00e1t bang Victoria cho bi\\u1ebft. Vatican lo ng\\u1ea1i v\\u1ec1 gi\\u00e1m m\\u1ee5c b\\u1ecb giam t\\u1ea1i TQ Th\\u1ec9nh c\\u1ea7u Vatican 'minh x\\u00e9t' v\\u1ec1 c\\u1ef1u TGM Ki\\u1ec7t H\\u1ed3ng y Pell, n\\u0103m nay 76 tu\\u1ed5i, \\u0111ang l\\u00e0m vi\\u00ea\\u0323c t\\u1ea1i Vatican, v\\u00e0 \\u0111\\u01b0\\u1ee3c coi l\\u00e0 ch\\u1ee9c s\\u1eafc cao c\\u1ea5p th\\u1ee9 ba \\u1edf Vatican. Ph\\u00e1t bi\\u1ec3u t\\u1ea1i m\\u00f4\\u0323t cu\\u00f4\\u0323c h\\u1ecdp b\\u00e1o, H\\u1ed3ng y Pell cho bi\\u1ebft \\u00f4ng s\\u1ebd bay v\\u1ec1 Australia n\\u1ebfu c\\u00e1c b\\u00e1c s\\u0129 cho ph\\u00e9p. 'T\\u00f4i mong \\u0111\\u1ebfn ng\\u00e0y t\\u00f4i ra t\\u00f2a,' \\u00f4ng n\\u00f3i. 'T\\u00f4i kh\\u00f4ng ph\\u1ea1m nh\\u1eefng t\\u00f4\\u0323i n\\u00e0y, ch\\u00fang l\\u00e0 gi\\u1ea3. Chuy\\u00ea\\u0323n l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c l\\u00e0 gh\\u00ea t\\u1edfm \\u0111\\u1ed1i v\\u1edbi t\\u00f4i.' Gi\\u00e1o ho\\u00e0ng Francis n\\u1ed7 l\\u1ef1c c\\u1ea3i t\\u1ed5 Vatican T\\u00f2a th\\u00e1nh C\\u00f4ng gi\\u00e1o tr\\u00ean kh\\u1eafp th\\u1ebf gi\\u1edbi \\u0111\\u00e3 ph\\u1ea3i \\u0111\\u1ed1i m\\u0103\\u0323t v\\u1edbi h\\u00e0ng lo\\u1ea1t c\\u00e1o bu\\u00f4\\u0323c li\\u00ean quan \\u0111\\u1ebfn l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c do c\\u00e1c th\\u1ea7y tu g\\u00e2y ra. C\\u00f3 ngu\\u1ed3n tin n\\u00f3i r\\u1eb1ng nh\\u1eefng tr\\u01b0\\u1eddng h\\u1ee3p n\\u00e0y b\\u1ecb che \\u0111\\u00e2\\u0323y. Ph\\u00f3ng vi\\u00ean James Reynolds c\\u1ee7a BBC n\\u00f3i nh\\u1eefng c\\u00e1o bu\\u00f4\\u0323c n\\u00e0y \\u0111\\u01b0a T\\u00f2a Th\\u00e1nh v\\u00e0o m\\u00f4\\u0323t th\\u1eddi \\u0111i\\u1ec3m kh\\u00f3 kh\\u0103n. 
\\u0110\\u1ee9c Gi\\u00e1o ho\\u00e0ng Francis \\u0111ang n\\u1ed7 l\\u1ef1c c\\u1ea3i t\\u1ed5 Vatican v\\u00e0 n\\u00e2ng cao ti\\u00eau chu\\u1ea9n \\u0111\\u1ea1o \\u0111\\u1ee9c. Tuy nhi\\u00ean, v\\u1ee5 \\u00e1n c\\u1ee7a H\\u1ed3ng y Pell cho th\\u1ea5y nh\\u1eefng th\\u00e1ch th\\u1ee9c to l\\u1edbn m\\u00e0 Gi\\u00e1o h\\u1ed9i C\\u00f4ng gi\\u00e1o ph\\u1ea3i \\u0111\\u1ed1i m\\u1eb7t trong vi\\u1ec7c gi\\u1ea3i quy\\u1ebft v\\u1ea5n \\u0111\\u1ec1 l\\u1ea1m d\\u1ee5ng t\\u00ecnh d\\u1ee5c v\\u00e0 gi\\u00e0nh l\\u1ea1i l\\u00f2ng tin c\\u1ee7a c\\u00f4ng ch\\u00fang. Vi\\u1ec7c H\\u1ed3ng y Pell kh\\u1eb3ng \\u0111\\u1ecbnh s\\u1ef1 trong s\\u1ea1ch c\\u1ee7a m\\u00ecnh v\\u00e0 s\\u1eb5n s\\u00e0ng \\u0111\\u1ed1i m\\u1eb7t v\\u1edbi t\\u00f2a \\u00e1n cho th\\u1ea5y \\u00f4ng t\\u1ef1 tin v\\u00e0o s\\u1ef1 v\\u00f4 t\\u1ed9i c\\u1ee7a m\\u00ecnh. Tuy nhi\\u00ean, s\\u1ef1 vi\\u1ec7c c\\u0169ng \\u0111\\u1eb7t ra c\\u00e2u h\\u1ecfi v\\u1ec1 t\\u00ednh minh b\\u1ea1ch v\\u00e0 hi\\u1ec7u qu\\u1ea3 c\\u1ee7a c\\u00e1c cu\\u1ed9c \\u0111i\\u1ec1u tra n\\u1ed9i b\\u1ed9 trong Gi\\u00e1o h\\u1ed9i. T\\u01b0\\u01a1ng lai\",\n \"\\u201cCh\\u00e0o s\\u00e2n\\u201d b\\u1eb1ng ca kh\\u00fac c\\u00f3 giai \\u0111i\\u1ec7u nh\\u1eb9 nh\\u00e0ng \\u201cY\\u00eau th\\u01b0\\u01a1ng mong manh\\u201d, c\\u1eb7p Qu\\u00e1ch Ng\\u1ecdc Ngoan \\u2013 Ng\\u1ecdc Anh th\\u00eam m\\u1ed9t l\\u1ea7n n\\u1eefa kh\\u1eb3ng \\u0111\\u1ecbnh r\\u1eb1ng nh\\u1ea1c tr\\u1eef t\\u00ecnh m\\u1edbi l\\u00e0 s\\u1edf tr\\u01b0\\u1eddng c\\u1ee7a h\\u1ecd. Gi\\u1ecdng h\\u00e1t ng\\u1ecdt ng\\u00e0o, t\\u00ecnh c\\u1ea3m c\\u1ee7a Ng\\u1ecdc Anh h\\u00f2a quy\\u1ec7n c\\u00f9ng ch\\u1ea5t gi\\u1ecdng tr\\u1ea7m \\u1ea5m, nam t\\u00ednh c\\u1ee7a Qu\\u00e1ch Ng\\u1ecdc Ngoan \\u0111\\u00e3 t\\u1ea1o n\\u00ean m\\u1ed9t m\\u00e0n tr\\u00ecnh di\\u1ec5n \\u0111\\u1ea7y c\\u1ea3m x\\u00fac, ch\\u1ea1m \\u0111\\u1ebfn tr\\u00e1i tim kh\\u00e1n gi\\u1ea3. 
S\\u1ef1 k\\u1ebft h\\u1ee3p \\u0103n \\u00fd n\\u00e0y kh\\u00f4ng ch\\u1ec9 d\\u1eebng l\\u1ea1i \\u1edf gi\\u1ecdng h\\u00e1t m\\u00e0 c\\u00f2n th\\u1ec3 hi\\u1ec7n qua \\u00e1nh m\\u1eaft, c\\u1eed ch\\u1ec9 t\\u00ecnh t\\u1ee9 tr\\u00ean s\\u00e2n kh\\u1ea5u. D\\u01b0\\u1eddng nh\\u01b0 h\\u1ecd \\u0111ang k\\u1ec3 m\\u1ed9t c\\u00e2u chuy\\u1ec7n t\\u00ecnh y\\u00eau \\u0111\\u1ea7y l\\u00e3ng m\\u1ea1n, nh\\u1eb9 nh\\u00e0ng nh\\u01b0ng c\\u0169ng kh\\u00f4ng k\\u00e9m ph\\u1ea7n s\\u00e2u l\\u1eafng. Ph\\u1ea7n h\\u00f2a \\u00e2m ph\\u1ed1i kh\\u00ed tinh t\\u1ebf c\\u00e0ng l\\u00e0m n\\u1ed5i b\\u1eadt l\\u00ean v\\u1ebb \\u0111\\u1eb9p c\\u1ee7a ca kh\\u00fac, \\u0111\\u01b0a ng\\u01b0\\u1eddi nghe ch\\u00ecm \\u0111\\u1eafm v\\u00e0o kh\\u00f4ng gian \\u00e2m nh\\u1ea1c tr\\u1eef t\\u00ecnh \\u0111\\u1ea7y m\\u00ea ho\\u1eb7c. S\\u1eafp t\\u1edbi,\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860,\n 955,\n 244\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 244,\n \"min\": 0,\n \"max\": 16568,\n \"num_unique_values\": 832,\n \"samples\": [\n 1584,\n 1041,\n 646\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "merged_df = VIE_test.merge(merged_df, left_index=True, right_on='id', how='outer')\n", "merged_df" ], "metadata": { "id": "yzQw_jhDr1-E", "colab": { "base_uri": "https://localhost:8080/", "height": 614 }, "outputId": "3b3cd06f-9e24-45cb-8a2a-b46932aab2b0" }, "execution_count": 20, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 VIE GPT-4o Rewritten Test \n", "1 VIE 
Claude-Sonnet-3.5 Rewritten Test \n", "2 VIE GPT-4o Partial Test \n", "3 VIE Claude-Sonnet-3.5 Partial Test \n", "4 VIE Claude-Haiku-3.5 Rewritten Test \n", "... ... ... ... ... \n", "49958 VIE Gemini-Flash-1.5 Partial Test \n", "49959 VIE Amazon-Nova-Pro-1.0 Partial Test \n", "49960 VIE Amazon-Nova-Lite-1.0 Partial Test \n", "49961 VIE Amazon-Nova-Lite-1.0 Partial Test \n", "49962 VIE Gemini-Flash-1.5 Unchanged Test \n", "\n", " Original text Original Word Count \\\n", "0 Theo nội dung cáo trạng công bố tại phiên tòa,... 93 \n", "1 Tại Trường tiểu học Lê Đình Chinh (Q.11), bên ... 87 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 400 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 1544 \n", "4 - Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ... 106 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 67 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 148 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 86 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 827 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " Original Char Count Split Location \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " Modified text New Word Count \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 
562 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " New Char Count id label_pred \\\n", "0 292 2 0 \n", "1 407 4 0 \n", "2 1264 5 201 \n", "3 3916 7 789 \n", "4 369 11 0 \n", "... ... ... ... \n", "49958 298 99923 20 \n", "49959 695 99924 39 \n", "49960 657 99925 36 \n", "49961 2535 99927 431 \n", "49962 1116 99928 140 \n", "\n", " text label_gold diff \n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 0 0 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 0 0 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 210 9 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 789 0 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 0 0 \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 20 0 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 65 26 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 43 7 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 460 29 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 68 \n", "\n", "[49963 rows x 16 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiff
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...6829220Theo thông tin cập nhật hôm sau, bị cáo L.T đã...00
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...9040740Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...00
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...28212645201Đây là vụ tấn công thứ hai trong vòng hai tuần...2109
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...84739167789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...7890
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83369110Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...00
...................................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...692989992320đối đãi rất tốt và không hề có mâu thuẫn trước...200
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...1536959992439Từ biên giới Trung Quốc, mất chừng ba giờ lái ...6526
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...1446579992536Được biết, hiện nay dòng sản phẩm váy cưới dàn...437
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...562253599927431Bà Haley: Hội đồng bảo vệ những nước vi phạm n...46029
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...208111699928140Lời cáo buộc được đưa ra giữa lúc quan h...20868
\n", "

49963 rows × 16 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "merged_df", "summary": "{\n \"name\": \"merged_df\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"VIE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Gemini-Pro-1.5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Rewritten\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49963,\n \"samples\": [\n \"B\\u1ea1n l\\u00e0 \\u1ee9ng vi\\u00ean ch\\u01b0a c\\u00f3 kinh nghi\\u1ec7m l\\u00e0m vi\\u1ec7c cho n\\u00ean nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng th\\u01b0\\u1eddng h\\u1ecfi nh\\u1eefng ki\\u1ebfn th\\u1ee9c b\\u1ea1n \\u0111\\u00e3 \\u0111\\u01b0\\u1ee3c \\u0111\\u00e0o t\\u1ea1o v\\u00e0 s\\u1ef1 v\\u1eadn d\\u1ee5ng c\\u00e1c ki\\u1ebfn th\\u1ee9c \\u0111\\u00f3 v\\u00e0o trong th\\u1ef1c t\\u1ebf th\\u00f4ng qua c\\u00e1c t\\u00ecnh hu\\u1ed1ng th\\u1ef1c t\\u1ebf. B\\u00ean c\\u1ea1nh \\u0111\\u00f3, nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng s\\u1ebd ph\\u1ecfng v\\u1ea5n b\\u1ea1n \\u0111\\u1ec3 th\\u1ea5y \\u0111\\u01b0\\u1ee3c nh\\u1eefng kh\\u1ea3 n\\u0103ng \\u1ee9ng ph\\u00f3 c\\u1ee7a b\\u1ea1n th\\u00f4ng qua c\\u00e1c c\\u00e2u h\\u1ecfi. 
N\\u00f3 th\\u1ec3 hi\\u1ec7n t\\u01b0 duy s\\u00e1ng t\\u1ea1o v\\u00e0 th\\u00f4ng minh trong khi tr\\u1ea3 l\\u1eddi c\\u1ee7a b\\u1ea1n.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 707,\n \"min\": 7,\n \"max\": 21413,\n \"num_unique_values\": 2810,\n \"samples\": [\n 1814\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3224,\n \"min\": 33,\n \"max\": 95406,\n \"num_unique_values\": 7520,\n \"samples\": [\n 9869\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 415,\n \"min\": 1,\n \"max\": 18582,\n \"num_unique_values\": 2011,\n \"samples\": [\n 871\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1879,\n \"min\": 6,\n \"max\": 77183,\n \"num_unique_values\": 5623,\n \"samples\": [\n 7507\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28749,\n \"min\": 2,\n \"max\": 99928,\n \"num_unique_values\": 49963,\n \"samples\": [\n 25022\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263,\n \"min\": 0,\n \"max\": 1887,\n \"num_unique_values\": 1621,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. 
Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
# %%
# Work on a copy so that merged_df keeps its original set of columns.
df = merged_df.copy()
tokenizer = AutoTokenizer.from_pretrained("hyperonym/xlm-roberta-longformer-base-16384")  # USE SAME TOKENIZER AS USED IN TRAINING


def check_split_position(row, max_tokens=2048, tok=None):
    """Report whether the gold split point lies within the first ``max_tokens`` tokens.

    The text in ``row['Modified text']`` is split on whitespace and the words
    that precede the split (the first ``row['Split Location']`` words) are
    tokenized one at a time.  As soon as their cumulative sub-word token count
    exceeds ``max_tokens`` the split is reported as "Outside".

    Parameters
    ----------
    row : mapping / pandas.Series
        Must provide 'Modified text' (str) and 'Split Location' (integer-like
        count of words that come before the split).
    max_tokens : int, default 2048
        Token budget to test against.
        NOTE(review): presumably the maximum sequence length used when the
        model was trained/evaluated - confirm against the training config.
    tok : object with a ``tokenize(str) -> list`` method, optional
        Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns
    -------
    str
        "Outside" if the words before the split need more than ``max_tokens``
        tokens, otherwise "Inside".
    """
    active_tok = tokenizer if tok is None else tok
    words = row['Modified text'].split()
    # Clamp at 0 so a negative value behaves like the original range() (no words).
    split_location = max(int(row['Split Location']), 0)

    cumulative_tokens = 0
    # Slicing (instead of indexing words[i]) cannot raise IndexError when the
    # recorded split location is larger than the number of whitespace words.
    for word in words[:split_location]:
        cumulative_tokens += len(active_tok.tokenize(word))
        if cumulative_tokens > max_tokens:
            # Budget already exceeded; no need to tokenize the remaining words.
            return "Outside"
    return "Inside"


df['Token Limit Check'] = df.apply(check_split_position, axis=1)
df
\n", "49958 VIE Gemini-Flash-1.5 Partial Test \n", "49959 VIE Amazon-Nova-Pro-1.0 Partial Test \n", "49960 VIE Amazon-Nova-Lite-1.0 Partial Test \n", "49961 VIE Amazon-Nova-Lite-1.0 Partial Test \n", "49962 VIE Gemini-Flash-1.5 Unchanged Test \n", "\n", " Original text Original Word Count \\\n", "0 Theo nội dung cáo trạng công bố tại phiên tòa,... 93 \n", "1 Tại Trường tiểu học Lê Đình Chinh (Q.11), bên ... 87 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 400 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 1544 \n", "4 - Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ... 106 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 67 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 148 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 86 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 827 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " Original Char Count Split Location \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " Modified text New Word Count \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 562 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 
208 \n", "\n", " New Char Count id label_pred \\\n", "0 292 2 0 \n", "1 407 4 0 \n", "2 1264 5 201 \n", "3 3916 7 789 \n", "4 369 11 0 \n", "... ... ... ... \n", "49958 298 99923 20 \n", "49959 695 99924 39 \n", "49960 657 99925 36 \n", "49961 2535 99927 431 \n", "49962 1116 99928 140 \n", "\n", " text label_gold diff \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 0 0 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 0 0 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 210 9 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 789 0 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 0 0 \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 20 0 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 65 26 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 43 7 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 460 29 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 68 \n", "\n", " Token Limit Check \n", "0 Inside \n", "1 Inside \n", "2 Inside \n", "3 Inside \n", "4 Inside \n", "... ... \n", "49958 Inside \n", "49959 Inside \n", "49960 Inside \n", "49961 Inside \n", "49962 Inside \n", "\n", "[49963 rows x 17 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiffToken Limit Check
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...6829220Theo thông tin cập nhật hôm sau, bị cáo L.T đã...00Inside
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...9040740Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...00Inside
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...28212645201Đây là vụ tấn công thứ hai trong vòng hai tuần...2109Inside
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...84739167789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...7890Inside
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83369110Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...00Inside
......................................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...692989992320đối đãi rất tốt và không hề có mâu thuẫn trước...200Inside
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...1536959992439Từ biên giới Trung Quốc, mất chừng ba giờ lái ...6526Inside
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...1446579992536Được biết, hiện nay dòng sản phẩm váy cưới dàn...437Inside
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...562253599927431Bà Haley: Hội đồng bảo vệ những nước vi phạm n...46029Inside
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...208111699928140Lời cáo buộc được đưa ra giữa lúc quan h...20868Inside
\n", "

49963 rows × 17 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"VIE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Gemini-Pro-1.5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Rewritten\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49963,\n \"samples\": [\n \"B\\u1ea1n l\\u00e0 \\u1ee9ng vi\\u00ean ch\\u01b0a c\\u00f3 kinh nghi\\u1ec7m l\\u00e0m vi\\u1ec7c cho n\\u00ean nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng th\\u01b0\\u1eddng h\\u1ecfi nh\\u1eefng ki\\u1ebfn th\\u1ee9c b\\u1ea1n \\u0111\\u00e3 \\u0111\\u01b0\\u1ee3c \\u0111\\u00e0o t\\u1ea1o v\\u00e0 s\\u1ef1 v\\u1eadn d\\u1ee5ng c\\u00e1c ki\\u1ebfn th\\u1ee9c \\u0111\\u00f3 v\\u00e0o trong th\\u1ef1c t\\u1ebf th\\u00f4ng qua c\\u00e1c t\\u00ecnh hu\\u1ed1ng th\\u1ef1c t\\u1ebf. B\\u00ean c\\u1ea1nh \\u0111\\u00f3, nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng s\\u1ebd ph\\u1ecfng v\\u1ea5n b\\u1ea1n \\u0111\\u1ec3 th\\u1ea5y \\u0111\\u01b0\\u1ee3c nh\\u1eefng kh\\u1ea3 n\\u0103ng \\u1ee9ng ph\\u00f3 c\\u1ee7a b\\u1ea1n th\\u00f4ng qua c\\u00e1c c\\u00e2u h\\u1ecfi. 
N\\u00f3 th\\u1ec3 hi\\u1ec7n t\\u01b0 duy s\\u00e1ng t\\u1ea1o v\\u00e0 th\\u00f4ng minh trong khi tr\\u1ea3 l\\u1eddi c\\u1ee7a b\\u1ea1n.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 707,\n \"min\": 7,\n \"max\": 21413,\n \"num_unique_values\": 2810,\n \"samples\": [\n 1814\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3224,\n \"min\": 33,\n \"max\": 95406,\n \"num_unique_values\": 7520,\n \"samples\": [\n 9869\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 415,\n \"min\": 1,\n \"max\": 18582,\n \"num_unique_values\": 2011,\n \"samples\": [\n 871\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1879,\n \"min\": 6,\n \"max\": 77183,\n \"num_unique_values\": 5623,\n \"samples\": [\n 7507\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28749,\n \"min\": 2,\n \"max\": 99928,\n \"num_unique_values\": 49963,\n \"samples\": [\n 25022\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263,\n \"min\": 0,\n \"max\": 1887,\n \"num_unique_values\": 1621,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. 
Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 244,\n \"min\": 0,\n \"max\": 16568,\n \"num_unique_values\": 832,\n \"samples\": [\n 1584\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Token Limit Check\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Outside\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 21 } ] }, { "cell_type": "code", "source": [ "df['Token Limit Check'].value_counts()" ], "metadata": { "id": "gWfUnO17r8zb", "colab": { "base_uri": "https://localhost:8080/", "height": 178 }, "outputId": "90f5de64-80e4-43c1-b69e-3ef4c34ab76a" }, "execution_count": 22, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Token Limit Check\n", "Inside 49449\n", "Outside 514\n", "Name: count, dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
count
Token Limit Check
Inside49449
Outside514
\n", "

" ] }, "metadata": {}, "execution_count": 22 } ] }, { "cell_type": "code", "source": [ "df['Split Location'].max()" ], "metadata": { "id": "HdNmbX6yr_Lv", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "09f9c92f-5039-4183-cbba-ff17050cb989" }, "execution_count": 23, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "18184" ] }, "metadata": {}, "execution_count": 23 } ] }, { "cell_type": "code", "source": [ "# prompt: 2 new columns in df_demo as series/list of zeroes and ones : WORDS_REAL : length is row's Word Count, start with row's Split Location number of zeroes and end with ones : WORDS_PRED : lenght is rows' Word Count , start with row's label_pred number of zeroes and end wit ones\n", "def create_word_series(row, column_name):\n", " word_count = row['New Word Count']\n", " split_location = row[column_name]\n", " series = [0] * split_location + [1] * (word_count - split_location)\n", " return series\n", "df['WORDS_REAL'] = df.apply(create_word_series, axis=1, args=('Split Location',))\n", "df['WORDS_PRED'] = df.apply(create_word_series, axis=1, args=('label_pred',))\n", "df" ], "metadata": { "id": "R6waU4p-sCcV", "colab": { "base_uri": "https://localhost:8080/", "height": 753 }, "outputId": "1d4ff143-a18b-4be1-9fd8-e6010dddad32" }, "execution_count": 24, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " ISO LLM used Type Data Split \\\n", "0 VIE GPT-4o Rewritten Test \n", "1 VIE Claude-Sonnet-3.5 Rewritten Test \n", "2 VIE GPT-4o Partial Test \n", "3 VIE Claude-Sonnet-3.5 Partial Test \n", "4 VIE Claude-Haiku-3.5 Rewritten Test \n", "... ... ... ... ... 
\n", "49958 VIE Gemini-Flash-1.5 Partial Test \n", "49959 VIE Amazon-Nova-Pro-1.0 Partial Test \n", "49960 VIE Amazon-Nova-Lite-1.0 Partial Test \n", "49961 VIE Amazon-Nova-Lite-1.0 Partial Test \n", "49962 VIE Gemini-Flash-1.5 Unchanged Test \n", "\n", " Original text Original Word Count \\\n", "0 Theo nội dung cáo trạng công bố tại phiên tòa,... 93 \n", "1 Tại Trường tiểu học Lê Đình Chinh (Q.11), bên ... 87 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 400 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 1544 \n", "4 - Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ... 106 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 67 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 148 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 86 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 827 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " Original Char Count Split Location \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " Modified text New Word Count \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 562 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 
208 \n", "\n", " New Char Count id label_pred \\\n", "0 292 2 0 \n", "1 407 4 0 \n", "2 1264 5 201 \n", "3 3916 7 789 \n", "4 369 11 0 \n", "... ... ... ... \n", "49958 298 99923 20 \n", "49959 695 99924 39 \n", "49960 657 99925 36 \n", "49961 2535 99927 431 \n", "49962 1116 99928 140 \n", "\n", " text label_gold diff \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 0 0 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 0 0 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 210 9 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 789 0 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 0 0 \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 20 0 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 65 26 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 43 7 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 460 29 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 68 \n", "\n", " Token Limit Check WORDS_REAL \\\n", "0 Inside [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "1 Inside [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "2 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 Inside [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "... ... ... \n", "49958 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49959 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49960 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49961 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49962 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED \n", "0 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "1 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", "4 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "... ... \n", "49958 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49959 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49960 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49961 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49962 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", "[49963 rows x 19 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word CountNew Char Countidlabel_predtextlabel_golddiffToken Limit CheckWORDS_REALWORDS_PRED
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...6829220Theo thông tin cập nhật hôm sau, bị cáo L.T đã...00Inside[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...9040740Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...00Inside[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...28212645201Đây là vụ tấn công thứ hai trong vòng hai tuần...2109Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...84739167789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...7890Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83369110Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...00Inside[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
............................................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...692989992320đối đãi rất tốt và không hề có mâu thuẫn trước...200Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...1536959992439Từ biên giới Trung Quốc, mất chừng ba giờ lái ...6526Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...1446579992536Được biết, hiện nay dòng sản phẩm váy cưới dàn...437Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...562253599927431Bà Haley: Hội đồng bảo vệ những nước vi phạm n...46029Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...208111699928140Lời cáo buộc được đưa ra giữa lúc quan h...20868Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
\n", "

49963 rows × 19 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 49963,\n \"fields\": [\n {\n \"column\": \"ISO\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"VIE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"LLM used\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Gemini-Pro-1.5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Rewritten\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Data Split\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"Test\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49963,\n \"samples\": [\n \"B\\u1ea1n l\\u00e0 \\u1ee9ng vi\\u00ean ch\\u01b0a c\\u00f3 kinh nghi\\u1ec7m l\\u00e0m vi\\u1ec7c cho n\\u00ean nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng th\\u01b0\\u1eddng h\\u1ecfi nh\\u1eefng ki\\u1ebfn th\\u1ee9c b\\u1ea1n \\u0111\\u00e3 \\u0111\\u01b0\\u1ee3c \\u0111\\u00e0o t\\u1ea1o v\\u00e0 s\\u1ef1 v\\u1eadn d\\u1ee5ng c\\u00e1c ki\\u1ebfn th\\u1ee9c \\u0111\\u00f3 v\\u00e0o trong th\\u1ef1c t\\u1ebf th\\u00f4ng qua c\\u00e1c t\\u00ecnh hu\\u1ed1ng th\\u1ef1c t\\u1ebf. B\\u00ean c\\u1ea1nh \\u0111\\u00f3, nh\\u00e0 tuy\\u1ec3n d\\u1ee5ng s\\u1ebd ph\\u1ecfng v\\u1ea5n b\\u1ea1n \\u0111\\u1ec3 th\\u1ea5y \\u0111\\u01b0\\u1ee3c nh\\u1eefng kh\\u1ea3 n\\u0103ng \\u1ee9ng ph\\u00f3 c\\u1ee7a b\\u1ea1n th\\u00f4ng qua c\\u00e1c c\\u00e2u h\\u1ecfi. 
N\\u00f3 th\\u1ec3 hi\\u1ec7n t\\u01b0 duy s\\u00e1ng t\\u1ea1o v\\u00e0 th\\u00f4ng minh trong khi tr\\u1ea3 l\\u1eddi c\\u1ee7a b\\u1ea1n.\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 707,\n \"min\": 7,\n \"max\": 21413,\n \"num_unique_values\": 2810,\n \"samples\": [\n 1814\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Original Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3224,\n \"min\": 33,\n \"max\": 95406,\n \"num_unique_values\": 7520,\n \"samples\": [\n 9869\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Split Location\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Modified text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Word Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 415,\n \"min\": 1,\n \"max\": 18582,\n \"num_unique_values\": 2011,\n \"samples\": [\n 871\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"New Char Count\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1879,\n \"min\": 6,\n \"max\": 77183,\n \"num_unique_values\": 5623,\n \"samples\": [\n 7507\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"id\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 28749,\n \"min\": 2,\n \"max\": 99928,\n \"num_unique_values\": 49963,\n \"samples\": [\n 25022\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_pred\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 263,\n \"min\": 0,\n \"max\": 1887,\n \"num_unique_values\": 1621,\n \"samples\": [\n 123\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 49953,\n \"samples\": [\n \"L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. 
Ph\\u00e1i vi\\u00ean n\\u00f3i \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecda t\\u1ea9y chay cu\\u1ed9c g\\u1eb7p\\u201d n\\u1ebfu nh\\u01b0 c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed do h\\u1ecd l\\u1ef1a ch\\u1ecdn kh\\u00f4ng c\\u00f3 m\\u1eb7t trong bu\\u1ed5i ti\\u1ebfp ki\\u1ebfn. Hi\\u1ec7n nay c\\u00e1c quan ch\\u1ee9c Asean \\u0111ang b\\u00e0n lu\\u1eadn v\\u1ec1 vi\\u1ec7c h\\u00ecnh th\\u00e0nh m\\u1ed9t t\\u1ed5 ch\\u1ee9c theo d\\u00f5i nh\\u00e2n quy\\u1ec1n cho Asean v\\u00e0 c\\u00e1c \\u0111i\\u1ec1u kho\\u1ea3n ho\\u1ea1t \\u0111\\u1ed9ng cho n\\u00f3. Trong Hi\\u1ebfn L\\u00e3nh \\u0111\\u1ea1o tham d\\u1ef1 h\\u1ed9i ngh\\u1ecb th\\u01b0\\u1ee3ng \\u0111\\u1ec9nh Asean l\\u1ea7n th\\u1ee9 14, h\\u1ecdp t\\u1ea1i Huahin Th\\u00e1i Lan, d\\u00e0nh ra 30 ph\\u00fat trong ng\\u00e0y th\\u1ee9 B\\u1ea3y (28.02) \\u0111\\u1ec3 g\\u1eb7p \\u0111\\u1ea1i di\\u1ec7n c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n c\\u1eed. Ph\\u00e1i vi\\u00ean cho bi\\u1ebft \\u0111\\u00e2y l\\u00e0 c\\u01a1 h\\u1ed9i cho c\\u00e1c t\\u1ed5 ch\\u1ee9c d\\u00e2n s\\u1ef1 \\u0111\\u1ec1 \\u0111\\u1ea1t ki\\u1ebfn ngh\\u1ecb c\\u1ee7a h\\u1ecd l\\u00ean c\\u00e1c nh\\u00e0 l\\u00e3nh \\u0111\\u1ea1o Asean, ch\\u1ee7 y\\u1ebfu li\\u00ean quan \\u0111\\u1ebfn c\\u00e1c v\\u1ea5n \\u0111\\u1ec1 d\\u00e2n ch\\u1ee7, d\\u00e2n sinh v\\u00e0 quy\\u1ec1n con ng\\u01b0\\u1eddi. 
B\\u1ea3n tin tr\\u00ean t\\u1edd b\\u00e1o ti\\u1ebfng Anh n\\u00f3i Mi\\u1ebfn \\u0110i\\u1ec7n v\\u00e0 Campuchia \\u201cd\\u1ecd\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label_gold\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 405,\n \"min\": 0,\n \"max\": 18184,\n \"num_unique_values\": 1920,\n \"samples\": [\n 860\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"diff\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 244,\n \"min\": 0,\n \"max\": 16568,\n \"num_unique_values\": 832,\n \"samples\": [\n 1584\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Token Limit Check\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Outside\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORDS_REAL\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"WORDS_PRED\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "# prompt: 4 new columns : ROW_TP, ROW_FP, ROW_TN , ROW_FN : based on zeroes and ones in WORDS_PRED , WORDS_REAL . 
def calculate_metrics(row):
    """Count per-word confusion-matrix entries for a single row.

    Compares ``row['WORDS_REAL']`` and ``row['WORDS_PRED']`` position by
    position, treating 1 as the positive label and 0 as the negative label.
    Positions whose values are not 0/1 are not counted.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing the
            ``'WORDS_REAL'`` and ``'WORDS_PRED'`` sequences.

    Returns:
        tuple[int, int, int, int]: ``(tp, fp, tn, fn)``.
    """
    # Look the two sequences up once; indexing the row inside the loop repeats
    # the lookup for every word.
    real_labels = row['WORDS_REAL']
    pred_labels = row['WORDS_PRED']

    tp = fp = tn = fn = 0
    # zip() compares only the overlapping positions, so a predicted sequence
    # that is shorter than the real one no longer raises IndexError (a longer
    # one was already truncated to the real length by the index-based loop).
    for real, pred in zip(real_labels, pred_labels):
        if real == 1 and pred == 1:
            tp += 1
        elif real == 0 and pred == 1:
            fp += 1
        elif real == 0 and pred == 0:
            tn += 1
        elif real == 1 and pred == 0:
            fn += 1
    return tp, fp, tn, fn
86 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 827 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " Original Char Count Split Location \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " Modified text New Word Count ... \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 ... \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 ... \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 ... \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 ... \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 ... \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 ... \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 ... \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 ... \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 562 ... \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 ... \n", "\n", " text label_gold diff \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 0 0 \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 0 0 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 210 9 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 789 0 \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 0 0 \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 20 0 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 65 26 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 43 7 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 460 29 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 68 \n", "\n", " Token Limit Check WORDS_REAL \\\n", "0 Inside [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "1 Inside [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 
\n", "2 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 Inside [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "... ... ... \n", "49958 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49959 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49960 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49961 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49962 Inside [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED ROW_TP ROW_FP ROW_TN \\\n", "0 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 68 0 0 \n", "1 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 90 0 0 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 72 9 201 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 58 0 789 \n", "4 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 83 0 0 \n", "... ... ... ... ... \n", "49958 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 49 0 20 \n", "49959 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 88 26 39 \n", "49960 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 101 7 36 \n", "49961 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 102 29 431 \n", "49962 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0 68 140 \n", "\n", " ROW_FN \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "49958 0 \n", "49959 0 \n", "49960 0 \n", "49961 0 \n", "49962 0 \n", "\n", "[49963 rows x 23 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...textlabel_golddiffToken Limit CheckWORDS_REALWORDS_PREDROW_TPROW_FPROW_TNROW_FN
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...68...Theo thông tin cập nhật hôm sau, bị cáo L.T đã...00Inside[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...68000
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...90...Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...00Inside[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...90000
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...282...Đây là vụ tấn công thứ hai trong vòng hai tuần...2109Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...7292010
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...847...Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...7890Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...5807890
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83...Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...00Inside[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...83000
..................................................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...69...đối đãi rất tốt và không hề có mâu thuẫn trước...200Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...490200
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...153...Từ biên giới Trung Quốc, mất chừng ba giờ lái ...6526Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...8826390
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...144...Được biết, hiện nay dòng sản phẩm váy cưới dàn...437Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...1017360
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...562...Bà Haley: Hội đồng bảo vệ những nước vi phạm n...46029Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...102294310
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...208...Lời cáo buộc được đưa ra giữa lúc quan h...20868Inside[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...0681400
\n", "

49963 rows × 23 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
def calculate_row_metrics(row, zero_division=0):
    """Derive accuracy, precision, recall and F1 from a row's confusion counts.

    Args:
        row: Mapping-like row (e.g. a pandas Series) providing ``'ROW_TP'``,
            ``'ROW_FP'``, ``'ROW_TN'`` and ``'ROW_FN'``.
        zero_division: Value reported for accuracy, precision or recall when
            its denominator is zero (the ratio is undefined). Defaults to 0.

    Returns:
        tuple: ``(accuracy, precision, recall, f1_score)``.
    """
    tp = row['ROW_TP']
    fp = row['ROW_FP']
    tn = row['ROW_TN']
    fn = row['ROW_FN']

    def _ratio(numerator, denominator):
        # An undefined ratio (empty denominator) falls back to zero_division.
        return numerator / denominator if denominator != 0 else zero_division

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)

    # When precision and recall are both zero the harmonic mean is reported
    # as 0 rather than dividing by zero.
    if (precision + recall) == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
    return accuracy, precision, recall, f1_score
87 \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 400 \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 1544 \n", "4 - Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ... 106 \n", "... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 67 \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 148 \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 86 \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 827 \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 \n", "\n", " Original Char Count Split Location \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " Modified text New Word Count ... \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 ... \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 ... \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 ... \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 ... \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 ... \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 ... \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 ... \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 ... \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 562 ... \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 ... \n", "\n", " WORDS_REAL \\\n", "0 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "1 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... \n", "... ... \n", "49958 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49959 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", "49960 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49961 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "49962 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "\n", " WORDS_PRED ROW_TP ROW_FP \\\n", "0 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 68 0 \n", "1 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 90 0 \n", "2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 72 9 \n", "3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 58 0 \n", "4 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... 83 0 \n", "... ... ... ... \n", "49958 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 49 0 \n", "49959 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 88 26 \n", "49960 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 101 7 \n", "49961 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 102 29 \n", "49962 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 0 68 \n", "\n", " ROW_TN ROW_FN ROW_ACC ROW_PREC ROW_REC ROW_F1 \n", "0 0 0 1.000000 1.000000 1.0 1.000000 \n", "1 0 0 1.000000 1.000000 1.0 1.000000 \n", "2 201 0 0.968085 0.888889 1.0 0.941176 \n", "3 789 0 1.000000 1.000000 1.0 1.000000 \n", "4 0 0 1.000000 1.000000 1.0 1.000000 \n", "... ... ... ... ... ... ... \n", "49958 20 0 1.000000 1.000000 1.0 1.000000 \n", "49959 39 0 0.830065 0.771930 1.0 0.871287 \n", "49960 36 0 0.951389 0.935185 1.0 0.966507 \n", "49961 431 0 0.948399 0.778626 1.0 0.875536 \n", "49962 140 0 0.673077 0.000000 0.0 0.000000 \n", "\n", "[49963 rows x 27 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...WORDS_REALWORDS_PREDROW_TPROW_FPROW_TNROW_FNROW_ACCROW_PRECROW_RECROW_F1
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...68...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...680001.0000001.0000001.01.000000
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...90...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...900001.0000001.0000001.01.000000
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...282...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...72920100.9680850.8888891.00.941176
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...847...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...58078901.0000001.0000001.01.000000
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...830001.0000001.0000001.01.000000
..................................................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...69...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...4902001.0000001.0000001.01.000000
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...153...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...88263900.8300650.7719301.00.871287
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...144...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...10173600.9513890.9351851.00.966507
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...562...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...1022943100.9483990.7786261.00.875536
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...208...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...06814000.6730770.0000000.00.000000
\n", "

49963 rows × 27 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
def calculate_percentage_of_ones(row):
    """Return ``sum / len`` of the row's ``WORDS_PRED`` sequence.

    Despite the name, the result is a ratio rather than a value scaled to 100.
    An empty sequence yields 0 instead of dividing by zero.
    """
    predicted_labels = row['WORDS_PRED']
    label_count = len(predicted_labels)
    if label_count:
        return sum(predicted_labels) / label_count
    return 0


# presumably WORDS_PRED holds 0/1 labels, making "Label : 1" the share of
# 1-labels and "Label : 0" its complement — verify against the cell that builds it.
df["Label : 1"] = df.apply(calculate_percentage_of_ones, axis=1)
df["Label : 0"] = 1.0 - df["Label : 1"]
df
208 \n", "\n", " Original Char Count Split Location \\\n", "0 411 0 \n", "1 393 0 \n", "2 1742 210 \n", "3 7097 789 \n", "4 491 0 \n", "... ... ... \n", "49958 286 20 \n", "49959 677 65 \n", "49960 384 43 \n", "49961 3774 460 \n", "49962 1116 208 \n", "\n", " Modified text New Word Count ... \\\n", "0 Theo thông tin cập nhật hôm sau, bị cáo L.T đã... 68 ... \n", "1 Ngày hôm sau tại Trường tiểu học Lê Đình Chinh... 90 ... \n", "2 Đây là vụ tấn công thứ hai trong vòng hai tuần... 282 ... \n", "3 Croatia bỏ lỡ cơ hội viết nên trang sử mới cho... 847 ... \n", "4 Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ... 83 ... \n", "... ... ... ... \n", "49958 đối đãi rất tốt và không hề có mâu thuẫn trước... 69 ... \n", "49959 Từ biên giới Trung Quốc, mất chừng ba giờ lái ... 153 ... \n", "49960 Được biết, hiện nay dòng sản phẩm váy cưới dàn... 144 ... \n", "49961 Bà Haley: Hội đồng bảo vệ những nước vi phạm n... 562 ... \n", "49962 Lời cáo buộc được đưa ra giữa lúc quan h... 208 ... \n", "\n", " ROW_TP ROW_FP ROW_TN ROW_FN ROW_ACC ROW_PREC ROW_REC ROW_F1 \\\n", "0 68 0 0 0 1.000000 1.000000 1.0 1.000000 \n", "1 90 0 0 0 1.000000 1.000000 1.0 1.000000 \n", "2 72 9 201 0 0.968085 0.888889 1.0 0.941176 \n", "3 58 0 789 0 1.000000 1.000000 1.0 1.000000 \n", "4 83 0 0 0 1.000000 1.000000 1.0 1.000000 \n", "... ... ... ... ... ... ... ... ... \n", "49958 49 0 20 0 1.000000 1.000000 1.0 1.000000 \n", "49959 88 26 39 0 0.830065 0.771930 1.0 0.871287 \n", "49960 101 7 36 0 0.951389 0.935185 1.0 0.966507 \n", "49961 102 29 431 0 0.948399 0.778626 1.0 0.875536 \n", "49962 0 68 140 0 0.673077 0.000000 0.0 0.000000 \n", "\n", " Label : 1 Label : 0 \n", "0 1.000000 0.000000 \n", "1 1.000000 0.000000 \n", "2 0.287234 0.712766 \n", "3 0.068477 0.931523 \n", "4 1.000000 0.000000 \n", "... ... ... 
\n", "49958 0.710145 0.289855 \n", "49959 0.745098 0.254902 \n", "49960 0.750000 0.250000 \n", "49961 0.233096 0.766904 \n", "49962 0.326923 0.673077 \n", "\n", "[49963 rows x 29 columns]" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ISOLLM usedTypeData SplitOriginal textOriginal Word CountOriginal Char CountSplit LocationModified textNew Word Count...ROW_TPROW_FPROW_TNROW_FNROW_ACCROW_PRECROW_RECROW_F1Label : 1Label : 0
0VIEGPT-4oRewrittenTestTheo nội dung cáo trạng công bố tại phiên tòa,...934110Theo thông tin cập nhật hôm sau, bị cáo L.T đã...68...680001.0000001.0000001.01.0000001.0000000.000000
1VIEClaude-Sonnet-3.5RewrittenTestTại Trường tiểu học Lê Đình Chinh (Q.11), bên ...873930Ngày hôm sau tại Trường tiểu học Lê Đình Chinh...90...900001.0000001.0000001.01.0000001.0000000.000000
2VIEGPT-4oPartialTestĐây là vụ tấn công thứ hai trong vòng hai tuần...4001742210Đây là vụ tấn công thứ hai trong vòng hai tuần...282...72920100.9680850.8888891.00.9411760.2872340.712766
3VIEClaude-Sonnet-3.5PartialTestCroatia bỏ lỡ cơ hội viết nên trang sử mới cho...15447097789Croatia bỏ lỡ cơ hội viết nên trang sử mới cho...847...58078901.0000001.0000001.01.0000000.0684770.931523
4VIEClaude-Haiku-3.5RewrittenTest- Tôi năm nay 41 tuổi, vừa mới cưới vợ được 4 ...1064910Cập nhật ngày hôm sau:\\n\\nSau khi áp dụng lời ...83...830001.0000001.0000001.01.0000001.0000000.000000
..................................................................
49958VIEGemini-Flash-1.5PartialTestđối đãi rất tốt và không hề có mâu thuẫn trước...6728620đối đãi rất tốt và không hề có mâu thuẫn trước...69...4902001.0000001.0000001.01.0000000.7101450.289855
49959VIEAmazon-Nova-Pro-1.0PartialTestTừ biên giới Trung Quốc, mất chừng ba giờ lái ...14867765Từ biên giới Trung Quốc, mất chừng ba giờ lái ...153...88263900.8300650.7719301.00.8712870.7450980.254902
49960VIEAmazon-Nova-Lite-1.0PartialTestĐược biết, hiện nay dòng sản phẩm váy cưới dàn...8638443Được biết, hiện nay dòng sản phẩm váy cưới dàn...144...10173600.9513890.9351851.00.9665070.7500000.250000
49961VIEAmazon-Nova-Lite-1.0PartialTestBà Haley: Hội đồng bảo vệ những nước vi phạm n...8273774460Bà Haley: Hội đồng bảo vệ những nước vi phạm n...562...1022943100.9483990.7786261.00.8755360.2330960.766904
49962VIEGemini-Flash-1.5UnchangedTestLời cáo buộc được đưa ra giữa lúc quan h...2081116208Lời cáo buộc được đưa ra giữa lúc quan h...208...06814000.6730770.0000000.00.0000000.3269230.673077
\n", "

49963 rows × 29 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0


def _mean_row_metrics(frame):
    """Return the means of ROW_ACC, ROW_PREC, ROW_REC and ROW_F1 over ``frame``.

    Each row contributes equally, however large its counts are, so these are
    per-row averages (as opposed to the pooled figures computed from summed counts).
    """
    return tuple(frame[column].mean() for column in ('ROW_ACC', 'ROW_PREC', 'ROW_REC', 'ROW_F1'))


BANNER = "######################################"

# One subset per value of the 'Type' column reported below.
df_unchanged = df[df['Type'] == 'Unchanged']
df_rewritten = df[df['Type'] == 'Rewritten']
df_partial = df[df['Type'] == 'Partial']

print(BANNER)
print(" METRICS BY TEXT TYPE : ")
print(BANNER)
for type_name, type_frame in (
    ('Partial', df_partial),
    ('Unchanged', df_unchanged),
    ('Rewritten', df_rewritten),
):
    if type_frame.empty:
        # An absent text type has nothing to average; .mean() would only print nan.
        continue
    AVG_ACC, AVG_PREC, AVG_REC, AVG_F1 = _mean_row_metrics(type_frame)
    print(f"{type_name} Cases : Average Accuracy : " , AVG_ACC)
    print(f"{type_name} Cases : Average Precision : " , AVG_PREC)
    print(f"{type_name} Cases : Average Recall : " , AVG_REC)
    print(f"{type_name} Cases : Average F1-score : " , AVG_F1)

print(BANNER)
print(" METRICS OVERALL : ")
print(BANNER)
# prompt: print AVG_ACC, AVG_PREC , AVG_REC , AVG_F1 as mean of values in columns ROW_ACC , ROW_REC , ROW_PREC , ROW_F1 from dataframe df
# Per-row averages over the whole frame.
AVG_ACC, AVG_PREC, AVG_REC, AVG_F1 = _mean_row_metrics(df)
print("All Cases : Average Accuracy:", AVG_ACC)
print("All Cases : Average Precision:", AVG_PREC)
print("All Cases : Average Recall:", AVG_REC)
print("All Cases : Average F1-score:", AVG_F1)
print(BANNER)
# prompt: Also print overall ACC,PREC,REC,F1 based on values of columns ROW_TN,ROW_TP,ROW_FN,ROW_FP
# Pooled figures: sum the counts over every row first, then form the ratios once.
# Rows with larger counts therefore weigh more than in the averages above.
total_tp = df['ROW_TP'].sum()
total_fp = df['ROW_FP'].sum()
total_tn = df['ROW_TN'].sum()
total_fn = df['ROW_FN'].sum()
accuracy = _safe_ratio(total_tp + total_tn, total_tp + total_tn + total_fp + total_fn)
precision = _safe_ratio(total_tp, total_tp + total_fp)
recall = _safe_ratio(total_tp, total_tp + total_fn)
f1_score = _safe_ratio(2 * (precision * recall), precision + recall)
print("Overall Accuracy:", accuracy)
print("Overall Precision:", precision)
print("Overall Recall:", recall)
print("Overall F1-score:", f1_score)
# For every value of 'LLM used', print the mean of the per-row metrics
# separately for each text type that has at least one row.
llm_values = df['LLM used'].unique()
metric_columns = (
    ("Accuracy", 'ROW_ACC'),
    ("Precision", 'ROW_PREC'),
    ("Recall", 'ROW_REC'),
    ("F1-score", 'ROW_F1'),
)
for llm in llm_values:
    print("LLM used:", llm)
    df_llm = df.loc[df['LLM used'] == llm]
    for text_type in ('Partial', 'Unchanged', 'Rewritten'):
        df_subset = df_llm.loc[df_llm['Type'] == text_type]
        if not df_subset.empty:
            # One line per metric, in the fixed order accuracy, precision, recall, F1.
            for metric_label, column in metric_columns:
                print(f"{text_type} Cases : Average {metric_label} : {df_subset[column].mean()}")
    print("######################################")
"execution_count": 29, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "LLM used: GPT-4o\n", "Partial Cases : Average Accuracy : 0.9171231464884239\n", "Partial Cases : Average Precision : 0.8604107629099346\n", "Partial Cases : Average Recall : 0.9857228281015794\n", "Partial Cases : Average F1-score : 0.9044538270101237\n", "Unchanged Cases : Average Accuracy : 0.7165614097587162\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9365721243421403\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9365721243421403\n", "Rewritten Cases : Average F1-score : 0.9583303904526252\n", "######################################\n", "LLM used: Claude-Sonnet-3.5\n", "Partial Cases : Average Accuracy : 0.9796492332061371\n", "Partial Cases : Average Precision : 0.9582938070412943\n", "Partial Cases : Average Recall : 0.9990514594085332\n", "Partial Cases : Average F1-score : 0.9700757115865892\n", "Unchanged Cases : Average Accuracy : 0.7038545396972906\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9990740740740741\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9990740740740741\n", "Rewritten Cases : Average F1-score : 0.9993975903614457\n", "######################################\n", "LLM used: Claude-Haiku-3.5\n", "Partial Cases : Average Accuracy : 0.983898010462177\n", "Partial Cases : Average Precision : 0.963989660851932\n", "Partial Cases : Average Recall : 0.9959532222702651\n", "Partial Cases : Average F1-score : 0.9710351462902974\n", "Unchanged Cases : Average Accuracy : 0.711050974123733\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average 
F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9042495281495535\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9042495281495535\n", "Rewritten Cases : Average F1-score : 0.9389683925792911\n", "######################################\n", "LLM used: Gemini-Flash-1.5\n", "Partial Cases : Average Accuracy : 0.9677641265274619\n", "Partial Cases : Average Precision : 0.9439636766702307\n", "Partial Cases : Average Recall : 0.9949414536578454\n", "Partial Cases : Average F1-score : 0.958366079797707\n", "Unchanged Cases : Average Accuracy : 0.7338097547697623\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9935895575638033\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9935895575638033\n", "Rewritten Cases : Average F1-score : 0.9949832997161984\n", "######################################\n", "LLM used: Amazon-Nova-Pro-1.0\n", "Partial Cases : Average Accuracy : 0.9420819648630684\n", "Partial Cases : Average Precision : 0.9040544472180075\n", "Partial Cases : Average Recall : 0.9880641795487695\n", "Partial Cases : Average F1-score : 0.9350074552407406\n", "Unchanged Cases : Average Accuracy : 0.7036149176235987\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9767380020417522\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9767380020417522\n", "Rewritten Cases : Average F1-score : 0.9856868708986889\n", "######################################\n", "LLM used: Mistral-Large-2411\n", "Partial Cases : Average Accuracy : 0.9172132679223375\n", "Partial Cases : Average Precision : 0.8322192171807924\n", "Partial Cases : Average Recall : 0.9937128539395118\n", "Partial Cases : 
Average F1-score : 0.8904205703429668\n", "Unchanged Cases : Average Accuracy : 0.7260735050182588\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9840175620697493\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9840175620697493\n", "Rewritten Cases : Average F1-score : 0.9904918299507316\n", "######################################\n", "LLM used: Amazon-Nova-Lite-1.0\n", "Partial Cases : Average Accuracy : 0.9603160873335176\n", "Partial Cases : Average Precision : 0.9454284020250661\n", "Partial Cases : Average Recall : 0.9819026042891019\n", "Partial Cases : Average F1-score : 0.9552166724028223\n", "Unchanged Cases : Average Accuracy : 0.7169368001381942\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9749074133945931\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9749074133945931\n", "Rewritten Cases : Average F1-score : 0.9830424653518793\n", "######################################\n", "LLM used: Aya-23\n", "Partial Cases : Average Accuracy : 0.8939010080863429\n", "Partial Cases : Average Precision : 0.8580504952201568\n", "Partial Cases : Average Recall : 0.9480139536580692\n", "Partial Cases : Average F1-score : 0.8687834693576781\n", "Unchanged Cases : Average Accuracy : 0.703219539879595\n", "Unchanged Cases : Average Precision : 0.0\n", "Unchanged Cases : Average Recall : 0.0\n", "Unchanged Cases : Average F1-score : 0.0\n", "Rewritten Cases : Average Accuracy : 0.9703275108339092\n", "Rewritten Cases : Average Precision : 1.0\n", "Rewritten Cases : Average Recall : 0.9703275108339092\n", "Rewritten Cases : Average F1-score : 0.9825254712486579\n", "######################################\n", "LLM used: 
# For every value of 'LLM used', sum the confusion counts over all of its rows
# and derive accuracy, precision, recall and F1 from those totals.
llm_values = df['LLM used'].unique()
for llm in llm_values:
    print("LLM used:", llm)
    df_llm = df[df['LLM used'] == llm]
    total_tp, total_fp, total_tn, total_fn = (
        df_llm[column].sum() for column in ('ROW_TP', 'ROW_FP', 'ROW_TN', 'ROW_FN')
    )

    # Name each denominator once; a zero denominator scores the metric as 0.
    all_counts = total_tp + total_tn + total_fp + total_fn
    predicted_positive = total_tp + total_fp
    actual_positive = total_tp + total_fn

    accuracy = (total_tp + total_tn) / all_counts if all_counts != 0 else 0
    precision = total_tp / predicted_positive if predicted_positive != 0 else 0
    recall = total_tp / actual_positive if actual_positive != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    print("Overall Accuracy:", accuracy)
    print("Overall Precision:", precision)
    print("Overall Recall:", recall)
    print("Overall F1-score:", f1_score)
    print("######################################")
"######################################\n", "LLM used: Mistral-Large-2411\n", "Overall Accuracy: 0.8648027593055178\n", "Overall Precision: 0.6204490118740533\n", "Overall Recall: 0.9924342023664823\n", "Overall F1-score: 0.7635454504971595\n", "######################################\n", "LLM used: Amazon-Nova-Lite-1.0\n", "Overall Accuracy: 0.8923279449609348\n", "Overall Precision: 0.7773804087862541\n", "Overall Recall: 0.9814971224689317\n", "Overall F1-score: 0.8675949527229833\n", "######################################\n", "LLM used: Aya-23\n", "Overall Accuracy: 0.8838739210595417\n", "Overall Precision: 0.7876467096275847\n", "Overall Recall: 0.9701416014334568\n", "Overall F1-score: 0.8694207777279531\n", "######################################\n", "LLM used: Gemini-Pro-1.5\n", "Overall Accuracy: 0.8743041177486947\n", "Overall Precision: 0.7368664525250682\n", "Overall Recall: 0.9780702797482503\n", "Overall F1-score: 0.8405058493358241\n", "######################################\n", "LLM used: GPT-o1\n", "Overall Accuracy: 0.9194109616367335\n", "Overall Precision: 0.8506393321146081\n", "Overall Recall: 0.9876984255652262\n", "Overall F1-score: 0.9140595905660327\n", "######################################\n" ] } ] }, { "cell_type": "code", "source": [ "df.info()" ], "metadata": { "id": "mr8VT4S9whud", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "3413c78b-3f50-42a2-dd24-02dcccb80cc0" }, "execution_count": 31, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 49963 entries, 0 to 49962\n", "Data columns (total 29 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 ISO 49963 non-null object \n", " 1 LLM used 49963 non-null object \n", " 2 Type 49963 non-null object \n", " 3 Data Split 49963 non-null object \n", " 4 Original text 49963 non-null object \n", " 5 Original Word Count 49963 non-null int64 \n", " 6 Original Char Count 49963 non-null int64 \n", " 7 
# Write the full frame, including the ROW_* metric and "Label : *" columns added
# above, to a CSV file in the current working directory.
# NOTE(review): with default arguments to_csv also writes the index as an unnamed
# first column, and object columns that hold lists (WORDS_REAL / WORDS_PRED appear
# to) are stored as their string form — confirm downstream readers expect that.
df.to_csv("VIE-INFERENCE-3.csv")
# Manual reminder for whoever runs the notebook: save it and download both the
# notebook and the CSV from the runtime.
print("CLICK CTRL+S, WAIT 2 SEC FOR IT TO BE SAVED, DOWNLOAD BOTH CODE AND THE CSV FILE FROM RUNTIME")