{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "id": "62A6F_072Fwq"
   },
   "source": [
    "## Install Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true,
    "id": "5w5AkskZ2Fwr"
   },
   "outputs": [],
   "source": [
    "#@title Install and Import Dependencies\n",
    "\n",
    "# this assumes that you have a relevant version of PyTorch installed\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel\n",
    "%pip install -q torchaudio\n",
    "\n",
    "SAMPLING_RATE = 16000\n",
    "\n",
    "import torch\n",
    "torch.set_num_threads(1)\n",
    "\n",
    "from IPython.display import Audio\n",
    "from pprint import pprint\n",
    "# download example\n",
    "torch.hub.download_url_to_file('https://models.silero.ai/vad_models/en.wav', 'en_example.wav')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "pSifus5IilRp"
   },
   "outputs": [],
   "source": [
    "USE_ONNX = False # change this to True if you want to test onnx model\n",
    "if USE_ONNX:\n",
    "    %pip install -q onnxruntime\n",
    "\n",
    "# force_reload=True makes torch.hub download the repo again instead of reusing its cache\n",
    "model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',\n",
    "                              model='silero_vad',\n",
    "                              force_reload=True,\n",
    "                              onnx=USE_ONNX)\n",
    "\n",
    "# unpack the helper functions/classes returned alongside the model\n",
    "(get_speech_timestamps,\n",
    " save_audio,\n",
    " read_audio,\n",
    " VADIterator,\n",
    " collect_chunks) = utils"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fXbbaUO3jsrw"
   },
   "source": [
    "## Full Audio"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "RAfJPb_a-Auj"
   },
   "source": [
    "**Speech timestamps from full audio**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "aI_eydBPjsrx"
   },
   "outputs": [],
   "source": [
    "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
    "# get speech timestamps from full audio file\n",
    "speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=SAMPLING_RATE)\n",
    "pprint(speech_timestamps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "OuEobLchjsry"
   },
   "outputs": [],
   "source": [
    "# merge all speech chunks to one audio\n",
    "save_audio('only_speech.wav',\n",
    "           collect_chunks(speech_timestamps, wav), sampling_rate=SAMPLING_RATE)\n",
    "Audio('only_speech.wav')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "iDKQbVr8jsry"
   },
   "source": [
    "## Stream imitation example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "q-lql_2Wjsry"
   },
   "outputs": [],
   "source": [
    "## using VADIterator class\n",
    "\n",
    "vad_iterator = VADIterator(model)\n",
    "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
    "\n",
    "window_size_samples = 1536 # number of samples in a single audio chunk\n",
    "for i in range(0, len(wav), window_size_samples):\n",
    "    chunk = wav[i: i+ window_size_samples]\n",
    "    if len(chunk) < window_size_samples:\n",
    "        break  # stop at the trailing partial chunk; only full windows are fed in\n",
    "    speech_dict = vad_iterator(chunk, return_seconds=True)\n",
    "    if speech_dict:\n",
    "        print(speech_dict, end=' ')\n",
    "vad_iterator.reset_states() # reset model states after each audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BX3UgwwB2Fwv"
   },
   "outputs": [],
   "source": [
    "## just probabilities\n",
    "\n",
    "wav = read_audio('en_example.wav', sampling_rate=SAMPLING_RATE)\n",
    "speech_probs = []\n",
    "window_size_samples = 1536 # number of samples in a single audio chunk\n",
    "for i in range(0, len(wav), window_size_samples):\n",
    "    chunk = wav[i: i+ window_size_samples]\n",
    "    if len(chunk) < window_size_samples:\n",
    "        break  # stop at the trailing partial chunk; only full windows are fed in\n",
    "    speech_prob = model(chunk, SAMPLING_RATE).item()\n",
    "    speech_probs.append(speech_prob)\n",
    "# This cell calls the model directly and never builds a VADIterator, so reset\n",
    "# the model itself rather than relying on a `vad_iterator` left over from\n",
    "# another cell (which is undefined if this cell is run on its own).\n",
    "model.reset_states() # reset model states after each audio\n",
    "\n",
    "print(speech_probs[:10]) # predictions for the first 10 chunks"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "silero-vad.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}