KPI-AI-team
/

UzbekSpellchecker

Model card Files Files and versions

xet

Community

KPI-AI-team committed on Feb 6, 2024

Commit

a50f7e0

verified ·

1 Parent(s): aa0e124

Upload 2 files

Browse files

uploaded data processing files

Files changed (2) hide show

data_preprocessing.ipynb +309 -0
environment.yml +0 -0

data_preprocessing.ipynb ADDED Viewed

	@@ -0,0 +1,309 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd \n",
+    "import numpy as np \n",
+    "import nltk\n",
+    "import os\n",
+    "import sklearn\n",
+    "import parquet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data1 = pd.read_parquet(\"news-00000-of-00007-0ff1ec222cd690f2.parquet\")\n",
+    "data2 = pd.read_parquet(\"news-00001-of-00007-7c273f5de9017dc5.parquet\")\n",
+    "data3 = pd.read_parquet(\"telegram_blogs-00000-of-00001-80087cf60adbe6d4.parquet\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Extract the 'text' column from each DataFrame\n",
+    "texts1 = data1['text']\n",
+    "texts2 = data2['text']\n",
+    "texts3 = data3['text']\n",
+    "\n",
+    "# Concatenate the 'text' columns from all three DataFrames\n",
+    "all_texts = pd.concat([texts1, texts2, texts3], ignore_index=True)\n",
+    "data = pd.DataFrame(all_texts, columns=['text'])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "count    643523.000000\n",
+      "mean       1225.167071\n",
+      "std        2613.174490\n",
+      "min           0.000000\n",
+      "25%         271.000000\n",
+      "50%         689.000000\n",
+      "75%        1362.000000\n",
+      "max      299171.000000\n",
+      "Name: text, dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Calculate the length of each text entry\n",
+    "text_lengths = data['text'].str.len()\n",
+    "\n",
+    "# Display the distribution of text lengths\n",
+    "length_distribution = text_lengths.describe()\n",
+    "\n",
+    "# Print the distribution\n",
+    "print(length_distribution)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import string\n",
+    "\n",
+    "def random_letter():\n",
+    "    \"\"\"Returns a random alphanumeric character.\"\"\"\n",
+    "    return random.choice(string.ascii_letters + string.digits)\n",
+    "\n",
+    "def replace_random_letters(word, pct=0.15):\n",
+    "    \"\"\"With probability pct, replaces one randomly chosen character of the word.\"\"\"\n",
+    "    if random.random() < pct:\n",
+    "        char_pos = random.choice(range(len(word)))\n",
+    "        return word[:char_pos] + random_letter() + word[char_pos + 1:]\n",
+    "    else:\n",
+    "        return word\n",
+    "\n",
+    "def misspell_text(text, pct=0.15, last_letter_error_pct=0.20):\n",
+    "    \"\"\"Generates a misspelled version of the input text.\"\"\"\n",
+    "    words = text.split()\n",
+    "    misspelled_words = [replace_random_letters(word, pct) for word in words]\n",
+    "    \n",
+    "    # Apply last letter error with a different probability\n",
+    "    for i, word in enumerate(misspelled_words):\n",
+    "        if random.random() < last_letter_error_pct:\n",
+    "            if len(word) > 1:  # Ensure word has more than 1 character\n",
+    "                misspelled_words[i] = word[:-1] + random_letter()\n",
+    "    \n",
+    "    return ' '.join(misspelled_words)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data.rename(columns={'text':'ground_truth'}, inplace=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'Index' object has no attribute '_format_flat'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/IPython/core/formatters.py:343\u001b[0m, in \u001b[0;36mBaseFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m    341\u001b[0m     method \u001b[38;5;241m=\u001b[39m get_real_method(obj, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_method)\n\u001b[1;32m    342\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m method \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 343\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    344\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    345\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py:1053\u001b[0m, in \u001b[0;36m_repr_html_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1036\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m   1037\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mshape\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mtuple\u001b[39m[\u001b[38;5;28mint\u001b[39m, \u001b[38;5;28mint\u001b[39m]:\n\u001b[1;32m   1038\u001b[0m     \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1039\u001b[0m \u001b[38;5;124;03m    Return a tuple representing the dimensionality of the DataFrame.\u001b[39;00m\n\u001b[1;32m   1040\u001b[0m \n\u001b[1;32m   1041\u001b[0m \u001b[38;5;124;03m    See Also\u001b[39;00m\n\u001b[1;32m   1042\u001b[0m \u001b[38;5;124;03m    --------\u001b[39;00m\n\u001b[1;32m   1043\u001b[0m \u001b[38;5;124;03m    ndarray.shape : Tuple of array dimensions.\u001b[39;00m\n\u001b[1;32m   1044\u001b[0m \n\u001b[1;32m   1045\u001b[0m \u001b[38;5;124;03m    Examples\u001b[39;00m\n\u001b[1;32m   1046\u001b[0m \u001b[38;5;124;03m    --------\u001b[39;00m\n\u001b[1;32m   1047\u001b[0m \u001b[38;5;124;03m    >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})\u001b[39;00m\n\u001b[1;32m   1048\u001b[0m \u001b[38;5;124;03m    >>> df.shape\u001b[39;00m\n\u001b[1;32m   1049\u001b[0m \u001b[38;5;124;03m    (2, 2)\u001b[39;00m\n\u001b[1;32m   1050\u001b[0m \n\u001b[1;32m   1051\u001b[0m \u001b[38;5;124;03m    >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],\u001b[39;00m\n\u001b[1;32m   1052\u001b[0m \u001b[38;5;124;03m    ...                    
'col3': [5, 6]})\u001b[39;00m\n\u001b[0;32m-> 1053\u001b[0m \u001b[38;5;124;03m    >>> df.shape\u001b[39;00m\n\u001b[1;32m   1054\u001b[0m \u001b[38;5;124;03m    (2, 3)\u001b[39;00m\n\u001b[1;32m   1055\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m   1056\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindex), \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns)\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/format.py:1102\u001b[0m, in \u001b[0;36mto_html\u001b[0;34m(self, buf, encoding, classes, notebook, border, table_id, render_links)\u001b[0m\n\u001b[1;32m   1079\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mformat_array\u001b[39m(\n\u001b[1;32m   1080\u001b[0m     values: ArrayLike,\n\u001b[1;32m   1081\u001b[0m     formatter: Callable \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m   1090\u001b[0m     fallback_formatter: Callable \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m   1091\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[1;32m   1092\u001b[0m     \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m   1093\u001b[0m \u001b[38;5;124;03m    Format an array for printing.\u001b[39;00m\n\u001b[1;32m   1094\u001b[0m \n\u001b[1;32m   1095\u001b[0m \u001b[38;5;124;03m    Parameters\u001b[39;00m\n\u001b[1;32m   1096\u001b[0m \u001b[38;5;124;03m    ----------\u001b[39;00m\n\u001b[1;32m   1097\u001b[0m \u001b[38;5;124;03m    values : np.ndarray or ExtensionArray\u001b[39;00m\n\u001b[1;32m   1098\u001b[0m \u001b[38;5;124;03m    formatter\u001b[39;00m\n\u001b[1;32m   1099\u001b[0m \u001b[38;5;124;03m    float_format\u001b[39;00m\n\u001b[1;32m   1100\u001b[0m \u001b[38;5;124;03m    na_rep\u001b[39;00m\n\u001b[1;32m   1101\u001b[0m \u001b[38;5;124;03m    digits\u001b[39;00m\n\u001b[0;32m-> 1102\u001b[0m \u001b[38;5;124;03m    space\u001b[39;00m\n\u001b[1;32m   1103\u001b[0m \u001b[38;5;124;03m    justify\u001b[39;00m\n\u001b[1;32m   1104\u001b[0m \u001b[38;5;124;03m    decimal\u001b[39;00m\n\u001b[1;32m   1105\u001b[0m \u001b[38;5;124;03m    leading_space : bool, optional, default True\u001b[39;00m\n\u001b[1;32m   1106\u001b[0m 
\u001b[38;5;124;03m        Whether the array should be formatted with a leading space.\u001b[39;00m\n\u001b[1;32m   1107\u001b[0m \u001b[38;5;124;03m        When an array as a column of a Series or DataFrame, we do want\u001b[39;00m\n\u001b[1;32m   1108\u001b[0m \u001b[38;5;124;03m        the leading space to pad between columns.\u001b[39;00m\n\u001b[1;32m   1109\u001b[0m \n\u001b[1;32m   1110\u001b[0m \u001b[38;5;124;03m        When formatting an Index subclass\u001b[39;00m\n\u001b[1;32m   1111\u001b[0m \u001b[38;5;124;03m        (e.g. IntervalIndex._get_values_for_csv), we don't want the\u001b[39;00m\n\u001b[1;32m   1112\u001b[0m \u001b[38;5;124;03m        leading space since it should be left-aligned.\u001b[39;00m\n\u001b[1;32m   1113\u001b[0m \u001b[38;5;124;03m    fallback_formatter\u001b[39;00m\n\u001b[1;32m   1114\u001b[0m \n\u001b[1;32m   1115\u001b[0m \u001b[38;5;124;03m    Returns\u001b[39;00m\n\u001b[1;32m   1116\u001b[0m \u001b[38;5;124;03m    -------\u001b[39;00m\n\u001b[1;32m   1117\u001b[0m \u001b[38;5;124;03m    List[str]\u001b[39;00m\n\u001b[1;32m   1118\u001b[0m \u001b[38;5;124;03m    \"\"\"\u001b[39;00m\n\u001b[1;32m   1119\u001b[0m     fmt_klass: \u001b[38;5;28mtype\u001b[39m[_GenericArrayFormatter]\n\u001b[1;32m   1120\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mis_np_dtype(values\u001b[38;5;241m.\u001b[39mdtype, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mM\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:88\u001b[0m, in \u001b[0;36mHTMLFormatter.to_string\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mto_string\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[0;32m---> 88\u001b[0m     lines \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     89\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m lines):\n\u001b[1;32m     90\u001b[0m         lines \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mstr\u001b[39m(x) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m lines]\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:644\u001b[0m, in \u001b[0;36mNotebookFormatter.render\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    642\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<div>\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    643\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite_style()\n\u001b[0;32m--> 644\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrender\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    645\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m</div>\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    646\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39melements\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:94\u001b[0m, in \u001b[0;36mHTMLFormatter.render\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m     93\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mrender\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m---> 94\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_write_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     96\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshould_show_dimensions:\n\u001b[1;32m     97\u001b[0m         by \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mchr\u001b[39m(\u001b[38;5;241m215\u001b[39m)  \u001b[38;5;66;03m# ×  # noqa: RUF003\u001b[39;00m\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:267\u001b[0m, in \u001b[0;36mHTMLFormatter._write_table\u001b[0;34m(self, indent)\u001b[0m\n\u001b[1;32m    261\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite(\n\u001b[1;32m    262\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m<table\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mborder_attr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m class=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(_classes)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mid_section\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m>\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[1;32m    263\u001b[0m     indent,\n\u001b[1;32m    264\u001b[0m )\n\u001b[1;32m    266\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfmt\u001b[38;5;241m.\u001b[39mheader \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshow_row_idx_names:\n\u001b[0;32m--> 267\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_write_header\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindent_delta\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    269\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_write_body(indent \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindent_delta)\n\u001b[1;32m    271\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m</table>\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
indent)\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:403\u001b[0m, in \u001b[0;36mHTMLFormatter._write_header\u001b[0;34m(self, indent)\u001b[0m\n\u001b[1;32m    400\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mwrite(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<thead>\u001b[39m\u001b[38;5;124m\"\u001b[39m, indent)\n\u001b[1;32m    402\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfmt\u001b[38;5;241m.\u001b[39mheader:\n\u001b[0;32m--> 403\u001b[0m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_write_col_header\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindent\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindent_delta\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    405\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshow_row_idx_names:\n\u001b[1;32m    406\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_write_row_header(indent \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mindent_delta)\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:383\u001b[0m, in \u001b[0;36mHTMLFormatter._write_col_header\u001b[0;34m(self, indent)\u001b[0m\n\u001b[1;32m    381\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    382\u001b[0m         row\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m--> 383\u001b[0m row\u001b[38;5;241m.\u001b[39mextend(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_columns_formatted_values\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m    384\u001b[0m align \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfmt\u001b[38;5;241m.\u001b[39mjustify\n\u001b[1;32m    386\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_truncated_horizontally:\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.9/site-packages/pandas/io/formats/html.py:611\u001b[0m, in \u001b[0;36mNotebookFormatter._get_columns_formatted_values\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    609\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_columns_formatted_values\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[1;32m    610\u001b[0m     \u001b[38;5;66;03m# only reached with non-Multi Index\u001b[39;00m\n\u001b[0;32m--> 611\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_format_flat\u001b[49m(include_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
+      "\u001b[0;31mAttributeError\u001b[0m: 'Index' object has no attribute '_format_flat'"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "                                             ground_truth\n",
+       "0       «Toshshahartransxizmat» AJ Axborot xizmati jam...\n",
+       "1       Oʻzbekiston Respublikasi Prezidenti Shavkat Mi...\n",
+       "2       Oʻzbekistonning AQSHdagi elchisi Javlon Vahobo...\n",
+       "3       Oliy Majlisning Inson huquqlari boʻyicha vakil...\n",
+       "4       Bu haqda Agentlik axborot xizmati xabar berdi....\n",
+       "...                                                   ...\n",
+       "643518  Марказий Осиё давлатлари бўйлаб эркин ҳаракатл...\n",
+       "643519  Олий таълим муассасаларида \\nКоррупцияга қар...\n",
+       "643520  Қирғизистонда вазир лавозимига тайинланганид...\n",
+       "643521  Исроил ва Фаластин ўқ отишни тўхтатишди. \"Масж...\n",
+       "643522  АҚШ 20 та давлатга ҳарбий иншоотлар учун 240...\n",
+       "\n",
+       "[643523 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Apply the function to create misspelled texts\n",
+    "data['sample_misspelled'] = data['ground_truth'].apply(lambda x: misspell_text(x))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                             ground_truth  \\\n",
+      "0       «Toshshahartransxizmat» AJ Axborot xizmati jam...   \n",
+      "1       Oʻzbekiston Respublikasi Prezidenti Shavkat Mi...   \n",
+      "2       Oʻzbekistonning AQSHdagi elchisi Javlon Vahobo...   \n",
+      "3       Oliy Majlisning Inson huquqlari boʻyicha vakil...   \n",
+      "4       Bu haqda Agentlik axborot xizmati xabar berdi....   \n",
+      "...                                                   ...   \n",
+      "643518  Марказий Осиё давлатлари бўйлаб эркин ҳаракатл...   \n",
+      "643519  Олий таълим муассасаларида \\nКоррупцияга қар...   \n",
+      "643520  Қирғизистонда вазир лавозимига тайинланганид...   \n",
+      "643521  Исроил ва Фаластин ўқ отишни тўхтатишди. \"Масж...   \n",
+      "643522  АҚШ 20 та давлатга ҳарбий иншоотлар учун 240...   \n",
+      "\n",
+      "                                        sample_misspelled  \n",
+      "0       «Toshshahartransxizmat» A6 Axborot xizmatG jam...  \n",
+      "1       Oʻzbekiston Respublikasi Prezidenti Shavkat Mi...  \n",
+      "2       Oʻzbekistonnin6 AQSHdagi elchisi Javlog Vahobo...  \n",
+      "3       Oliy Majlisning Inson huquqlar2 boʻyicha Makil...  \n",
+      "4       Bu haqda AgentliU axlorot xizmal9 xabar berdi....  \n",
+      "...                                                   ...  \n",
+      "643518  Марказиc ОсGё давлатлаIи бўйлаб эркин ҳаракатл...  \n",
+      "643519  yОлиe таъcиQ муассасаларида КоррупциягX қарш4...  \n",
+      "643520  Қирғизистонда вазиP лавозимига тайинланганид...  \n",
+      "643521  Исроил tf Фаластиt ўқ отишни тўхтатиPдиA \"Масж...  \n",
+      "643522  АҚШ 20 тB давлатга ҳарбиn иншоотлар учун 240...  \n",
+      "\n",
+      "[643523 rows x 2 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expanded_data = pd.concat([data['ground_truth'], data['ground_truth']], ignore_index=True).to_frame(name='ground_truth')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "\n",
+    "def creative_misspell(text):\n",
+    "    \"\"\"Apply creative misspelling strategies to the text.\"\"\"\n",
+    "    words = text.split()\n",
+    "    misspelled_words = []\n",
+    "    for word in words:\n",
+    "        # Example of a simple typographical error\n",
+    "        if random.random() < 0.05:  # Apply with 5% probability\n",
+    "            word = word.replace('the', 'teh')\n",
+    "        \n",
+    "        # Example of omitting letters\n",
+    "        if random.random() < 0.05 and len(word) > 3:\n",
+    "            omit_index = random.randint(1, len(word) - 2)  # Avoid first and last character\n",
+    "            word = word[:omit_index] + word[omit_index + 1:]\n",
+    "        \n",
+    "        # Add more rules as needed\n",
+    "        \n",
+    "        misspelled_words.append(word)\n",
+    "    \n",
+    "    return ' '.join(misspelled_words)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expanded_data['sample_misspelled'] = expanded_data['ground_truth'].apply(creative_misspell)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

environment.yml ADDED Viewed

File without changes