add language model

Browse files

Files changed (5) hide show

alphabet.json +1 -0
build_lm_processor.ipynb +200 -0
language_model/attrs.json +1 -0
language_model/km_wiki_ngram.arpa +3 -0
language_model/unigrams.txt +0 -0

alphabet.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"labels": [" ", "\u1780", "\u1781", "\u1782", "\u1783", "\u1784", "\u1785", "\u1786", "\u1787", "\u1788", "\u1789", "\u178a", "\u178b", "\u178c", "\u178d", "\u178e", "\u178f", "\u1790", "\u1791", "\u1792", "\u1793", "\u1794", "\u1795", "\u1796", "\u1797", "\u1798", "\u1799", "\u179a", "\u179b", "\u179c", "\u179f", "\u17a0", "\u17a1", "\u17a2", "\u17a5", "\u17a7", "\u17aa", "\u17ab", "\u17ac", "\u17ad", "\u17ae", "\u17af", "\u17b1", "\u17b6", "\u17b7", "\u17b8", "\u17b9", "\u17ba", "\u17bb", "\u17bc", "\u17bd", "\u17be", "\u17bf", "\u17c0", "\u17c1", "\u17c2", "\u17c3", "\u17c4", "\u17c5", "\u17c6", "\u17c7", "\u17c8", "\u17c9", "\u17ca", "\u17cb", "\u17cc", "\u17cd", "\u17ce", "\u17cf", "\u17d0", "\u17d2", "\u2047", "", "<s>", "</s>"], "is_bpe": false}

build_lm_processor.ipynb ADDED Viewed

	@@ -0,0 +1,200 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "5393aa33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoModelForCTC, Wav2Vec2Processor, AutoProcessor, Wav2Vec2ProcessorWithLM\n",
+    "from datasets import load_dataset, load_metric, Audio\n",
+    "from pyctcdecode import build_ctcdecoder\n",
+    "from pydub import AudioSegment\n",
+    "from pydub.playback import play\n",
+    "\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import kenlm\n",
+    "import pandas as pd\n",
+    "import random\n",
+    "import soundfile as sf\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2d34d3b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# KenLM n-gram language model (ARPA text format) that the CTC decoder is built from.\n",
+    "# A word-unigram alternative, km_text_word_unigram.arpa, sits in the same data directory.\n",
+    "KENLM_MODEL_LOC = '/workspace/xls-r-300m-km/data/km_wiki_ngram.arpa'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "f0354cb2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading the LM will be faster if you build a binary file.\n",
+      "Reading /workspace/xls-r-300m-km/vitouphy/xls-r-300m-km/language_model/km_text.arpa\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "Only 81 unigrams passed as vocabulary. Is this small or artificial data?\n",
+      "****************************************************************************************************\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load the processor published as vitouphy/xls-r-300m-km; its tokenizer and\n",
+    "# feature extractor are reused when assembling the LM-backed processor.\n",
+    "processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path=\"vitouphy/xls-r-300m-km\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "109f28e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'|': 0, 'ក': 1, 'ខ': 2, 'គ': 3, 'ឃ': 4, 'ង': 5, 'ច': 6, 'ឆ': 7, 'ជ': 8, 'ឈ': 9, 'ញ': 10, 'ដ': 11, 'ឋ': 12, 'ឌ': 13, 'ឍ': 14, 'ណ': 15, 'ត': 16, 'ថ': 17, 'ទ': 18, 'ធ': 19, 'ន': 20, 'ប': 21, 'ផ': 22, 'ព': 23, 'ភ': 24, 'ម': 25, 'យ': 26, 'រ': 27, 'ល': 28, 'វ': 29, 'ស': 30, 'ហ': 31, 'ឡ': 32, 'អ': 33, 'ឥ': 34, 'ឧ': 35, 'ឪ': 36, 'ឫ': 37, 'ឬ': 38, 'ឭ': 39, 'ឮ': 40, 'ឯ': 41, 'ឱ': 42, 'ា': 43, 'ិ': 44, 'ី': 45, 'ឹ': 46, 'ឺ': 47, 'ុ': 48, 'ូ': 49, 'ួ': 50, 'ើ': 51, 'ឿ': 52, 'ៀ': 53, 'េ': 54, 'ែ': 55, 'ៃ': 56, 'ោ': 57, 'ៅ': 58, 'ំ': 59, 'ះ': 60, 'ៈ': 61, '៉': 62, '៊': 63, '់': 64, '៌': 65, '៍': 66, '៎': 67, '៏': 68, '័': 69, '្': 70, '[unk]': 71, '[pad]': 72, '<s>': 73, '</s>': 74}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Order the tokenizer vocabulary by token index so that label i lines up with\n",
+    "# column i of the model's CTC logits, which is what pyctcdecode expects.\n",
+    "vocab_dict = processor.tokenizer.get_vocab()\n",
+    "sorted_vocab_dict = {\n",
+    "    token.lower(): index\n",
+    "    for token, index in sorted(vocab_dict.items(), key=lambda item: item[1])\n",
+    "}\n",
+    "# Lower-casing can merge two tokens into one key; that would drop a label and\n",
+    "# misalign every following label with the model's output indices, so fail loudly.\n",
+    "assert len(sorted_vocab_dict) == len(vocab_dict), 'lower-casing merged distinct vocabulary tokens'\n",
+    "print(sorted_vocab_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "300cec39",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loading the LM will be faster if you build a binary file.\n",
+      "Reading /workspace/xls-r-300m-km/data/km_wiki_ngram.arpa\n",
+      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
+      "Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?\n",
+      "****************************************************************************************************\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build the pyctcdecode decoder from the index-ordered labels and the KenLM ARPA file.\n",
+    "decoder = build_ctcdecoder(labels=[*sorted_vocab_dict], kenlm_model_path=KENLM_MODEL_LOC)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "27dd8427",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bundle the already-loaded feature extractor and tokenizer with the new LM-backed decoder.\n",
+    "processor_with_lm = Wav2Vec2ProcessorWithLM(\n",
+    "    decoder=decoder,\n",
+    "    tokenizer=processor.tokenizer,\n",
+    "    feature_extractor=processor.feature_extractor,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "94eb248e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the processor files and the decoder's language model into the current directory.\n",
+    "processor_with_lm.save_pretrained(save_directory=\".\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f9b3dcc",
+   "metadata": {},
+   "source": [
+    "## Save Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8b584690",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "bc5bf68946064e97b869d44b02e7af19",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Load the CTC acoustic model that pairs with the processor built above.\n",
+    "model = AutoModelForCTC.from_pretrained(pretrained_model_name_or_path=\"vitouphy/xls-r-300m-km\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3712c030",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the model weights and config next to the processor files in the current directory.\n",
+    "model.save_pretrained(save_directory='.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5d8de20",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

language_model/attrs.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"alpha": 0.5, "beta": 1.5, "unk_score_offset": -10.0, "score_boundary": true}

language_model/km_wiki_ngram.arpa ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a4eae7d94d04e95668df7306edf35e21f4bbab2a73c736b921e531cd25cde6d0
+size 109085039

language_model/unigrams.txt ADDED Viewed

The diff for this file is too large to render. See raw diff