{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"2022-01-03-asr.ipynb","provenance":[{"file_id":"https://github.com/recohut/nbs/blob/main/raw/3e4f7482-f98c-4da9-9881-d672d3593be9.ipynb","timestamp":1644426823870}],"collapsed_sections":[],"toc_visible":true,"authorship_tag":"ABX9TyPC88O3N7FLUeOQERyzB3KK"},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["# Automatic Speech Recognition"],"metadata":{"id":"XYagIcaoeUGQ"}},{"cell_type":"markdown","metadata":{"id":"qniRUjlo7GOi"},"source":["### Loading Libraries"]},{"cell_type":"code","metadata":{"id":"FMeQoLFi32xL"},"source":["import os\n","import sys\n","from os.path import exists, join, basename, splitext\n","from IPython.display import YouTubeVideo"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"eNlh7bZJ-3BY"},"source":["!pip install -q youtube-dl librosa python_speech_features sentencepiece\n","!pip install jiwer\n","import jiwer"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"2a2EdR8z86JQ"},"source":["### Loading Data"]},{"cell_type":"code","metadata":{"id":"xUNTuB-U4nfT","outputId":"a826573e-a53d-437a-dd1a-eec1d0fead65","executionInfo":{"status":"ok","timestamp":1587648176572,"user_tz":-330,"elapsed":20969,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"colab":{"base_uri":"https://localhost:8080/","height":321}},"source":["# Get the YouTube Video\n","YOUTUBE_ID = 'mi8N5gDVpeg'\n","YouTubeVideo(YOUTUBE_ID)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["\n"," \n"," "],"text/plain":[""],"image/jpeg":"\n"},"metadata":{"tags":[]},"execution_count":3}]},{"cell_type":"code","metadata":{"id":"IBRMsz3m5L6G","outputId":"429dc338-4b0c-40de-da41-1cee821d2ec2","executionInfo":{"status":"ok","timestamp":1587648193981,"user_tz":-330,"elapsed":38355,"user":{"displayName":"Sparsh 
Agarwal","photoUrl":"","userId":"13037694610922482904"}},"colab":{"base_uri":"https://localhost:8080/","height":102}},"source":["# Download, extract audio and convert to wav\n","!rm -rf *.wav\n","!youtube-dl --extract-audio --audio-format wav --output \"test.%(ext)s\" https://www.youtube.com/watch\\?v\\={YOUTUBE_ID}"],"execution_count":null,"outputs":[{"output_type":"stream","text":["[youtube] mi8N5gDVpeg: Downloading webpage\n","[download] Destination: test.webm\n","\u001b[K[download] 100% of 4.68MiB in 00:00\n","[ffmpeg] Destination: test.wav\n","Deleting original file test.webm (pass -k to keep)\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"SGBqamRPD5GT","outputId":"f9cf9c84-4bbd-4007-b08d-dea6c1285730","executionInfo":{"status":"ok","timestamp":1587648198667,"user_tz":-330,"elapsed":43008,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"colab":{"base_uri":"https://localhost:8080/","height":496}},"source":["!ffmpeg -i test.wav -ss 00:00:11 -to 00:01:11 -c copy test2.wav"],"execution_count":null,"outputs":[{"output_type":"stream","text":["ffmpeg version 3.4.6-0ubuntu0.18.04.1 Copyright (c) 2000-2019 the FFmpeg developers\n"," built with gcc 7 (Ubuntu 7.3.0-16ubuntu3)\n"," configuration: --prefix=/usr --extra-version=0ubuntu0.18.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame 
--enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-omx --enable-openal --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-chromaprint --enable-frei0r --enable-libopencv --enable-libx264 --enable-shared\n"," libavutil 55. 78.100 / 55. 78.100\n"," libavcodec 57.107.100 / 57.107.100\n"," libavformat 57. 83.100 / 57. 83.100\n"," libavdevice 57. 10.100 / 57. 10.100\n"," libavfilter 6.107.100 / 6.107.100\n"," libavresample 3. 7. 0 / 3. 7. 0\n"," libswscale 4. 8.100 / 4. 8.100\n"," libswresample 2. 9.100 / 2. 9.100\n"," libpostproc 54. 7.100 / 54. 7.100\n","\u001b[0;33mGuessed Channel Layout for Input Stream #0.0 : stereo\n","\u001b[0mInput #0, wav, from 'test.wav':\n"," Metadata:\n"," encoder : Lavf57.83.100\n"," Duration: 00:05:03.03, bitrate: 1536 kb/s\n"," Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s\n","Output #0, wav, to 'test2.wav':\n"," Metadata:\n"," ISFT : Lavf57.83.100\n"," Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, stereo, s16, 1536 kb/s\n","Stream mapping:\n"," Stream #0:0 -> #0:0 (copy)\n","Press [q] to stop, [?] 
for help\n","size= 11252kB time=00:00:59.99 bitrate=1536.4kbits/s speed=2.84e+03x \n","video:0kB audio:11252kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000677%\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"X4btkrmIEKRx","colab":{"base_uri":"https://localhost:8080/","height":75},"outputId":"041875c2-b397-4b78-8776-ce74eae330cf","executionInfo":{"status":"ok","timestamp":1587648254305,"user_tz":-330,"elapsed":98629,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}}},"source":["import IPython\n","IPython.display.Audio(\"test2.wav\")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["\n"," \n"," "],"text/plain":[""]},"metadata":{"tags":[]},"execution_count":6}]},{"cell_type":"code","metadata":{"id":"Vyv8G0cg_Vm_"},"source":["ground_truth = '''we didn't start with much money or many endorsements. \n","our campaign was not hatched in the halls of washington.\n","it began in the backyards of des moines.\n","and the living rooms of concord and the front porches of charleston.\n","it was built by working men and women who dug into what little savings they had\n","to give five dollars and ten dollars and twenty dollars to the cause.\n","it grew strength from the young people who rejected the myth of their generations' \n","apathy. who left their homes and their families for jobs that offered little pay\n","and less sleep. it drew strength from the not so young people who braved the bitter\n","cold and scorching heat to knock on doors of perfect strangers. 
and from the\n","millions of americans who volunteered and organized and proved that more than \n","two centuries later a government of the people by the people and for the people\n","has not perished from the earth.'''"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"3o51RO419EDS"},"source":["### DeepSpeech"]},{"cell_type":"code","metadata":{"id":"x8obp0wD4iIT"},"source":["# Install DeepSpeech\n","if not exists('deepspeech-0.6.1-models'):\n"," !apt-get install -qq sox\n"," !pip install -q deepspeech-gpu==0.6.1 youtube-dl\n"," !wget https://github.com/mozilla/DeepSpeech/releases/download/v0.6.1/deepspeech-0.6.1-models.tar.gz\n"," !tar xvfz deepspeech-0.6.1-models.tar.gz"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"lMBdIXIs5cJM","outputId":"94dc9249-0aea-4c19-aa9a-4961d52a47bd","executionInfo":{"status":"ok","timestamp":1587646360465,"user_tz":-330,"elapsed":31290,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"colab":{"base_uri":"https://localhost:8080/","height":513}},"source":["!deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio test2.wav"],"execution_count":null,"outputs":[{"output_type":"stream","text":["Loading model from file deepspeech-0.6.1-models/output_graph.pbmm\n","TensorFlow: v1.14.0-21-ge77504a\n","DeepSpeech: v0.6.1-0-g3df20fe\n","2020-04-23 12:52:17.666078: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA\n","2020-04-23 12:52:17.670574: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuda.so.1\n","2020-04-23 12:52:17.705373: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning 
NUMA node zero\n","2020-04-23 12:52:17.705965: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: \n","name: Tesla T4 major: 7 minor: 5 memoryClockRate(GHz): 1.59\n","pciBusID: 0000:00:04.0\n","2020-04-23 12:52:17.705984: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.\n","2020-04-23 12:52:17.706033: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n","2020-04-23 12:52:17.706529: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n","2020-04-23 12:52:17.707017: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0\n","2020-04-23 12:52:18.069651: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:\n","2020-04-23 12:52:18.069691: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187] 0 \n","2020-04-23 12:52:18.069700: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1200] 0: N \n","2020-04-23 12:52:18.069860: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n","2020-04-23 12:52:18.070397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n","2020-04-23 12:52:18.070929: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero\n","2020-04-23 
12:52:18.071437: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:40] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.\n","2020-04-23 12:52:18.071478: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14192 MB memory) -> physical GPU (device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5)\n","Loaded model in 0.416s.\n","Loading language model from files deepspeech-0.6.1-models/lm.binary deepspeech-0.6.1-models/trie\n","Loaded language model in 0.000291s.\n","Warning: original sample rate (48000) is different than 16000hz. Resampling might produce erratic speech recognition.\n","Running inference.\n","we don't say but much money or many norse our campaign was not had in the hall at washington and began in the back yards of the boy the livin rooms have continent porch is of charles i was bellamy wortman and women who dug and to what little savings had to give five dollars ten dollars and twenty dollars sadako it grew strained from the young people over jeered at their generation at the letter home and their families for jobs that offer little pay and less play iteratum the nut so young people who breathe better cold and scorching heat on not on door the furbishing and from the millions of americans a volunteer and organize and from bear than two centuries later a government of the people by the people and for the people had not paris from \n","Inference took 13.224s for 180.032s audio file.\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"dQAbsiV4ACPa"},"source":["transcription_deepspeech = '''we don't say but much money or many norse our \n","campaign was not had in the hall at washington and began in the back yards of \n","the boy the livin rooms have continent porch is of charles i was bellamy \n","wortman and women who dug and to what little savings had to 
give five dollars \n",
"ten dollars and twenty dollars sadako it grew strained from the young people \n",
"over jeered at their generation at the letter home and their families for jobs \n",
"that offer little pay and less play iteratum the nut so young people who breathe \n",
"better cold and scorching heat on not on door the furbishing and from the millions \n",
"of americans a volunteer and organize and from bear than two centuries later a \n",
"government of the people by the people and for the people had not paris from '''"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"Ky6CV3mMA8Uv","colab":{"base_uri":"https://localhost:8080/","height":68},"outputId":"d581eecc-f63d-4bab-a4e3-459f0318d903","executionInfo":{"status":"ok","timestamp":1587646501601,"user_tz":-330,"elapsed":970,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}}},"source":[
"import re\n",
"\n",
"def normalize_text(text):\n",
"  \"\"\"Lower-case `text`, turn every run of characters other than a-z, 0-9 and apostrophes into one space, and trim.\"\"\"\n",
"  return ' '.join(re.sub(r\"[^a-z0-9']+\", ' ', text.lower()).split())\n",
"\n",
"# The reference text contains punctuation and line breaks that the transcript lacks.\n",
"# Both texts are normalised the same way before scoring so that tokens such as\n",
"# \"washington.\" and \"washington\" are not counted as different words.\n",
"measures = jiwer.compute_measures(normalize_text(ground_truth),\n",
"                                  normalize_text(transcription_deepspeech))\n",
"print(measures['wer'])\n",
"print(measures['mer'])\n",
"print(measures['wil'])"],"execution_count":null,"outputs":[]},
{"cell_type":"markdown","metadata":{"id":"Ba7ppx7n9xwI"},"source":["### Wave2Letter+"]},
{"cell_type":"code","metadata":{"id":"APbDUwPC5tkg","outputId":"b3c93e50-244a-4612-e314-e008ba95fbb5","executionInfo":{"status":"ok","timestamp":1587647136363,"user_tz":-330,"elapsed":4821,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":[
"# Install Wave2Letter+ (OpenSeq2Seq)\n",
"%tensorflow_version 1.x\n",
"git_repo_url = 'https://github.com/NVIDIA/OpenSeq2Seq.git'\n",
"git_commit = 'e958b7d'\n",
"project_name = splitext(basename(git_repo_url))[0]\n",
"if not exists(project_name):\n",
"  # Clone with full history: a --depth 1 clone only holds the branch tip, so an\n",
"  # older commit could not be checked out. `git -C` runs the checkout inside the\n",
"  # clone; every `!` line gets its own shell, so a bare `git checkout` would run\n",
"  # in the notebook's working directory, which is not a git repository.\n",
"  !git clone -q {git_repo_url}\n",
"  !git -C {project_name} checkout -q {git_commit}\n",
"  !pip uninstall -y -q pymc3\n",
"  !pip install --upgrade joblib\n",
"  !pip install -q youtube-dl librosa python_speech_features sentencepiece\n",
"\n",
"# (Re)create the eval config on every run, not only right after cloning.\n",
"# `cp` resets conf.py first, so the sed edits and the appended backend line are\n",
"# never applied twice, and a conf.py written by another section is replaced.\n",
"!cp {project_name}/example_configs/speech2text/w2lplus_large_8gpus_mp.py {project_name}/conf.py\n",
"!sed -i -e 's/data\\/librispeech\\/librivox-test-clean/test/' {project_name}/conf.py\n",
"!sed -i -e 's/# \"use_lang/\"use_lang/' {project_name}/conf.py\n",
"!echo 'backend = \"librosa\"' >> {project_name}/conf.py\n",
"!echo \"wav_filename, wav_filesize, transcript\" > {project_name}/test.csv\n",
"!echo \"test.wav, UNUSED, UNUSED\" >> {project_name}/test.csv\n",
"\n",
"# Avoid stacking duplicate entries when the cell is re-run.\n",
"if project_name not in sys.path:\n",
"  sys.path.append(project_name)\n",
"\n",
"# Download pre-trained weights\n",
"def download_from_google_drive(file_id, file_name):\n",
"  \"\"\"Download a file from Google Drive with curl.\n",
"\n",
"  Args:\n",
"    file_id: value of the `id` query parameter of the Drive link.\n",
"    file_name: local path the download is written to.\n",
"  \"\"\"\n",
"  !rm -f ./cookie\n",
"  # The first request only stores cookies in ./cookie; its body is discarded.\n",
"  !curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id={file_id}\" > /dev/null\n",
"  # Last field of every cookie line that mentions \"download\" (the confirm token).\n",
"  confirm_lines = !awk '/download/ {print $NF}' ./cookie\n",
"  # No matching line gives an empty list; use an empty token instead of raising IndexError.\n",
"  confirm_text = confirm_lines[0] if confirm_lines else ''\n",
"  !curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm={confirm_text}&id={file_id}\" -o {file_name}\n",
"\n",
"if not exists(join(project_name, 'w2l_log_folder')):\n",
"  download_from_google_drive('10EYe040qVW6cfygSZz6HwGQDylahQNSa', 'w2l_plus_large.tar')\n",
"  !tar xf w2l_plus_large.tar\n",
"  !mv w2l_plus_large {project_name}/w2l_log_folder"],"execution_count":null,"outputs":[{"output_type":"stream","text":["TensorFlow 1.x selected.\n"],"name":"stdout"}]},
{"cell_type":"code","metadata":{"id":"0Ls-M0Ge5cHi"},"source":[
"# Convert the 60 s clip to 16 kHz mono 16-bit PCM (test2.wav is 48 kHz stereo).\n",
"# -y overwrites an existing test.wav instead of stopping at ffmpeg's overwrite prompt.\n",
"!ffmpeg -loglevel error -y -i test2.wav -acodec pcm_s16le -ac 1 -ar 16000 {project_name}/test.wav\n",
"!cd {project_name} && python run.py --config_file conf.py --mode=infer --infer_output_file=output.txt --use_horovod=False --num_gpus=1 --batch_size_per_gpu 
1"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"gCEgHbEgMM96","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"6a2f8a55-1184-4c45-fd8d-714a0a380a14","executionInfo":{"status":"ok","timestamp":1587647988265,"user_tz":-330,"elapsed":5790,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}}},"source":["!tail -n1 {project_name}/output.txt"],"execution_count":null,"outputs":[{"output_type":"stream","text":["tail: cannot open 'OpenSeq2Seq/output.txt' for reading: No such file or directory\n"],"name":"stdout"}]},
{"cell_type":"markdown","metadata":{"id":"gNNhhHgQMsfy"},"source":["### Jasper"]},
{"cell_type":"code","metadata":{"id":"uhesD_m-6Hgc"},"source":[
"# Installing Jasper (OpenSeq2Seq)\n",
"%tensorflow_version 1.x\n",
"git_repo_url = 'https://github.com/NVIDIA/OpenSeq2Seq.git'\n",
"git_commit = 'e958b7d'\n",
"project_name = splitext(basename(git_repo_url))[0]\n",
"if not exists(project_name):\n",
"  # Clone with full history: a --depth 1 clone only holds the branch tip, so an\n",
"  # older commit could not be checked out. `git -C` runs the checkout inside the\n",
"  # clone; every `!` line gets its own shell, so a bare `git checkout` would run\n",
"  # in the notebook's working directory, which is not a git repository.\n",
"  !git clone -q {git_repo_url}\n",
"  !git -C {project_name} checkout -q {git_commit}\n",
"  !pip uninstall -y -q pymc3\n",
"  !pip install --upgrade joblib\n",
"  !pip install -q youtube-dl librosa python_speech_features sentencepiece\n",
"\n",
"# Always (re)create the Jasper eval config. The clone step above is skipped when\n",
"# the repository already exists (e.g. after the Wave2Letter+ section), and conf.py\n",
"# would otherwise still hold that section's settings. `cp` resets conf.py first,\n",
"# so the sed edit and the appended backend line are never applied twice.\n",
"!cp {project_name}/example_configs/speech2text/jasper10x5_LibriSpeech_nvgrad.py {project_name}/conf.py\n",
"!sed -i -e 's/\\/data\\/librispeech\\/librivox-test-clean/test/' {project_name}/conf.py\n",
"!echo 'backend = \"librosa\"' >> {project_name}/conf.py\n",
"!echo \"wav_filename, wav_filesize, transcript\" > {project_name}/test.csv\n",
"!echo \"test.wav, UNUSED, UNUSED\" >> {project_name}/test.csv\n",
"\n",
"# Download pretrained weights\n",
"def download_from_google_drive(file_id, file_name):\n",
"  \"\"\"Download a file from Google Drive with curl.\n",
"\n",
"  Args:\n",
"    file_id: value of the `id` query parameter of the Drive link.\n",
"    file_name: local path the download is written to.\n",
"  \"\"\"\n",
"  !rm -f ./cookie\n",
"  # The first request only stores cookies in ./cookie; its body is discarded.\n",
"  !curl -c ./cookie -s -L \"https://drive.google.com/uc?export=download&id={file_id}\" > /dev/null\n",
"  # Last field of every cookie line that mentions \"download\" (the confirm token).\n",
"  confirm_lines = !awk '/download/ {print $NF}' ./cookie\n",
"  # No matching line gives an empty list; use an empty token instead of raising IndexError.\n",
"  confirm_text = confirm_lines[0] if confirm_lines else ''\n",
"  !curl -Lb ./cookie \"https://drive.google.com/uc?export=download&confirm={confirm_text}&id={file_id}\" -o {file_name}\n",
"\n",
"# Guard on the folder this section creates (jasper_log_folder). Testing for\n",
"# w2l_log_folder skipped the download whenever the Wave2Letter+ weights existed.\n",
"if not exists(join(project_name, 'jasper_log_folder')):\n",
"  download_from_google_drive('1gzGT8HoVNKY1i5HNQTKaSoCu7JHV4siR', 'jasper_10x5_dr_sp_nvgrad.zip')\n",
"  !unzip jasper_10x5_dr_sp_nvgrad.zip\n",
"  !mv checkpoint {project_name}/jasper_log_folder"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"kg_b6Q8BOQrs"},"source":[
"# Convert the 60 s clip to 16 kHz mono 16-bit PCM (test2.wav is 48 kHz stereo).\n",
"# -y overwrites an existing test.wav instead of stopping at ffmpeg's overwrite prompt.\n",
"!ffmpeg -loglevel error -y -i test2.wav -acodec pcm_s16le -ac 1 -ar 16000 {project_name}/test.wav\n",
"!cd {project_name} && python run.py --config_file conf.py --mode=infer --infer_output_file=output.txt --use_horovod=False --num_gpus=1 --batch_size_per_gpu 1"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"ba3fGtOqNKSh","colab":{"base_uri":"https://localhost:8080/","height":34},"outputId":"a17cc98b-2f8d-42f3-8856-ed6db2a8f8c3","executionInfo":{"status":"ok","timestamp":1587648597425,"user_tz":-330,"elapsed":69198,"user":{"displayName":"Sparsh Agarwal","photoUrl":"","userId":"13037694610922482904"}}},"source":["!tail -n1 {project_name}/output.txt"],"execution_count":null,"outputs":[{"output_type":"stream","text":["tail: cannot open 'OpenSeq2Seq/output.txt' for reading: No such file or directory\n"],"name":"stdout"}]},
{"cell_type":"markdown","metadata":{"id":"jlsbGs7qOc-1"},"source":["### QuartzNet"]},
{"cell_type":"code","metadata":{"id":"Ql37_kXg6S-Y"},"source":[
"# Installing QuartzNet\n",
"!pip -q install wget youtube-dl tensorboardX kaldi-io marshmallow num2words ruamel.yaml soundfile sox torch-stft unidecode\n",
"!pip install -q nemo-toolkit==0.9.0 nemo-asr==0.9.0 #--no-deps\n",
"\n",
"# we also need Apex\n",
"if not 
exists('apex'):\n",
"  !git clone -q --depth 1 https://github.com/NVIDIA/apex\n",
"  !cd apex && pip install -q --no-cache-dir ./\n",
"\n",
"# Download pre-trained weights\n",
"if not exists('quartznet15x5_multidataset'):\n",
"  # download the pretrained weights\n",
"  !wget -nc -q --show-progress -O quartznet15x5.zip https://api.ngc.nvidia.com/v2/models/nvidia/multidataset_quartznet15x5/versions/1/zip\n",
"  !unzip quartznet15x5.zip && mkdir quartznet15x5_multidataset && mv Jasper* quartznet15x5.yaml quartznet15x5_multidataset\n",
"\n",
"\n",
"# Imports\n",
"import json\n",
"import os\n",
"from ruamel.yaml import YAML\n",
"import nemo\n",
"import nemo_asr\n",
"\n",
"# Model files are resolved against the working directory, which is where the\n",
"# download step above unpacks them (this is /content on Colab).\n",
"WORK_DIR = os.path.abspath('quartznet15x5_multidataset')\n",
"MODEL_YAML = os.path.join(WORK_DIR, 'quartznet15x5.yaml')\n",
"CHECKPOINT_ENCODER = os.path.join(WORK_DIR, 'JasperEncoder-STEP-243800.pt')\n",
"CHECKPOINT_DECODER = os.path.join(WORK_DIR, 'JasperDecoderForCTC-STEP-243800.pt')\n",
"# Set this to True to enable beam search decoder\n",
"ENABLE_NGRAM = False\n",
"# This is only necessary if ENABLE_NGRAM = True. Otherwise, set to empty string\n",
"LM_PATH = \"\"\n",
"\n",
"# Read model YAML\n",
"yaml = YAML(typ=\"safe\")\n",
"with open(MODEL_YAML) as f:\n",
"  jasper_model_definition = yaml.load(f)\n",
"labels = jasper_model_definition['labels']\n",
"\n",
"# Instantiate necessary Neural Modules\n",
"# Note that data layer is missing from here\n",
"neural_factory = nemo.core.NeuralModuleFactory(\n",
"    placement=nemo.core.DeviceType.GPU,\n",
"    backend=nemo.core.Backend.PyTorch)\n",
"data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(factory=neural_factory)\n",
"jasper_encoder = nemo_asr.JasperEncoder(\n",
"    jasper=jasper_model_definition['JasperEncoder']['jasper'],\n",
"    activation=jasper_model_definition['JasperEncoder']['activation'],\n",
"    feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'])\n",
"jasper_encoder.restore_from(CHECKPOINT_ENCODER, local_rank=0)\n",
"jasper_decoder = nemo_asr.JasperDecoderForCTC(\n",
"    feat_in=1024,\n",
"    num_classes=len(labels))\n",
"jasper_decoder.restore_from(CHECKPOINT_DECODER, local_rank=0)\n",
"greedy_decoder = nemo_asr.GreedyCTCDecoder()\n",
"\n",
"def wav_to_text(manifest, greedy=True):\n",
"  \"\"\"Transcribe the audio listed in a manifest file.\n",
"\n",
"  Args:\n",
"    manifest: path of the JSON manifest (see create_manifest).\n",
"    greedy: if True, decode with the greedy CTC decoder. If False, decode with\n",
"      beam search, which needs ENABLE_NGRAM = True and a module-level\n",
"      `beam_search_with_lm` decoder.\n",
"\n",
"  Returns:\n",
"    The result of post_process_predictions for greedy decoding, otherwise the\n",
"    entry tensors[0][0][0][0][1] of the beam-search result.\n",
"\n",
"  Raises:\n",
"    ValueError: greedy=False while ENABLE_NGRAM is False.\n",
"    RuntimeError: beam search was requested but `beam_search_with_lm` is not defined.\n",
"  \"\"\"\n",
"  beam_decoder = None\n",
"  if not greedy:\n",
"    # Fail early with a clear message: no cell in this notebook defines\n",
"    # beam_search_with_lm, so it has to be created before beam search is used.\n",
"    if not ENABLE_NGRAM:\n",
"      raise ValueError('greedy=False requires ENABLE_NGRAM = True')\n",
"    beam_decoder = globals().get('beam_search_with_lm')\n",
"    if beam_decoder is None:\n",
"      raise RuntimeError('ENABLE_NGRAM is set but no beam_search_with_lm decoder is defined')\n",
"\n",
"  # Instantiate necessary neural modules; `labels` is the list read from MODEL_YAML above.\n",
"  data_layer = nemo_asr.AudioToTextDataLayer(\n",
"      shuffle=False,\n",
"      manifest_filepath=manifest,\n",
"      labels=labels, batch_size=1)\n",
"\n",
"  # Define inference DAG\n",
"  audio_signal, audio_signal_len, _, _ = data_layer()\n",
"  processed_signal, processed_signal_len = data_preprocessor(\n",
"      input_signal=audio_signal,\n",
"      length=audio_signal_len)\n",
"  encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,\n",
"                                        length=processed_signal_len)\n",
"  log_probs = jasper_decoder(encoder_output=encoded)\n",
"\n",
"  # Only the decoder that was asked for is added to the graph.\n",
"  if greedy:\n",
"    eval_tensors = [greedy_decoder(log_probs=log_probs)]\n",
"  else:\n",
"    print('Running with beam search')\n",
"    eval_tensors = [beam_decoder(log_probs=log_probs, log_probs_length=encoded_len)]\n",
"\n",
"  tensors = neural_factory.infer(tensors=eval_tensors)\n",
"  if greedy:\n",
"    from nemo_asr.helpers import post_process_predictions\n",
"    return post_process_predictions(tensors[0], labels)\n",
"  return tensors[0][0][0][0][1]\n",
"\n",
"def create_manifest(file_path, duration=18000):\n",
"  \"\"\"Write a one-entry manifest next to `file_path` and return the manifest path.\n",
"\n",
"  Args:\n",
"    file_path: path of the audio file to transcribe.\n",
"    duration: value stored in the manifest's 'duration' field. The default keeps\n",
"      the previously hard-coded 18000; presumably a generous upper bound in\n",
"      seconds - TODO confirm against the data layer.\n",
"\n",
"  Returns:\n",
"    Path of the manifest file (`file_path` + '.json').\n",
"  \"\"\"\n",
"  manifest = {\n",
"      'audio_filepath': file_path,\n",
"      'duration': duration,\n",
"      'text': 'todo',\n",
"  }\n",
"  manifest_path = file_path + '.json'\n",
"  with open(manifest_path, 'w') as fout:\n",
"    json.dump(manifest, fout)\n",
"  return manifest_path"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"mF3WpWWy5cGx"},"source":[
"# Convert the 60 s clip (test2.wav, 48 kHz stereo) to 16 kHz mono 16-bit PCM.\n",
"# The result goes to its own file so the downloaded test.wav is left untouched.\n",
"!ffmpeg -loglevel error -y -i test2.wav -acodec pcm_s16le -ac 1 -ar 16000 test_16k.wav\n",
"\n",
"transcription = wav_to_text(create_manifest('test_16k.wav'))\n",
"\n",
"print('\\n\\n')\n",
"print(transcription)"],"execution_count":null,"outputs":[]}]}