{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import the speaker recognition module\n", "import speaker.recognition as SR\n", "Gender = SR.GMMRec()  # GMM-based recognizer; here used to tell female from male voices" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import scipy.io.wavfile as wav\n", "from speaker.silence import remove_silence\n", "from features import mfcc\n", "def get_mfcc(audio_path):\n", "    (sr, sig) = wav.read(audio_path)\n", "    if len(sig.shape) > 1:\n", "        sig = sig[:, 0]  # keep only the first channel of multi-channel audio\n", "    cleansig = remove_silence(sr, sig)  # strip silent frames before feature extraction\n", "    mfcc_vecs = mfcc(cleansig, sr, numcep=15)\n", "    return mfcc_vecs" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Here we use MFCCs as the audio features; in principle other audio features (e.g. LPC) should work as well\n", "female_mfcc = np.array(get_mfcc('./Audio/female.wav'))  # female_mfcc.shape = (N1, D): N1 vectors of dimension D\n", "male_mfcc = np.array(get_mfcc('./Audio/male.wav'))  # male_mfcc.shape = (N2, D)\n", "Gender.enroll('Female', female_mfcc)  # enroll the female audio features\n", "Gender.enroll('Male', male_mfcc)  # enroll the male audio features\n", "Gender.train()  # train the GMMs with PyCASP\n", "Gender.dump('gender.model')  # save the trained model to a file named \"gender.model\" for future use" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('Female', -22.373874909185876)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Gender = SR.GMMRec.load('gender.model')  # not necessary if you just trained the model\n", "test_mfcc = np.array(get_mfcc('/Users/xuhe/Downloads/SpectrogramInversion1.02b/tapestr_rec.wav'))  # test_mfcc.shape = (N3, D)\n", "Gender.predict(test_mfcc)  # returns (label, log_lkld): the most probable speaker label and the log likelihood of test_mfcc under that speaker's model" ] },
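{ "cell_type": "markdown", "metadata": {}, "source": [ "Since `predict` returns a `(label, log_lkld)` pair, the log likelihood can also be used to reject low-confidence predictions, e.g. audio from a speaker who was never enrolled. Below is a minimal sketch of that idea; the helper `predict_with_reject` and the `LLKD_THRESHOLD` value are illustrative assumptions, not part of the speaker.recognition API, and a real threshold should be tuned on held-out audio." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Hypothetical helper: reject predictions whose log likelihood falls below a cutoff.\n", "LLKD_THRESHOLD = -25.0  # assumed cutoff, chosen for illustration only\n", "\n", "def predict_with_reject(recognizer, audio_path, threshold=LLKD_THRESHOLD):\n", "    feats = np.array(get_mfcc(audio_path))\n", "    label, log_lkld = recognizer.predict(feats)\n", "    if log_lkld < threshold:\n", "        return ('Unknown', log_lkld)  # too unlikely to be any enrolled speaker\n", "    return (label, log_lkld)\n", "\n", "# Example usage:\n", "# predict_with_reject(Gender, './Audio/female.wav')" ] },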
" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def totime(secs):\n", " m, s = divmod(secs, 60)\n", " h, m = divmod(m, 60)\n", " return h, m, s\n", "\n", "def showresult(recognizer, sig, sr, head):\n", " cleansig = remove_silence(sr, sig)\n", " mfcc_vecs = mfcc(cleansig, sr, numcep = 15) \n", " print(\"%d:%02d:%02d\" % (totime(head)), recognizer.predict(\n", " mfcc_vecs))\n", "\n", "def recognize(recognizer, audio_path, step = 1, duration = 1.5):\n", " (fs, signal) = wav.read(audio_path)\n", " if len(signal.shape) > 1:\n", " signal = signal[:, 0]\n", " head = 0\n", " totallen = np.round(signal.shape[0] / fs).astype(int)\n", " print('Recognition results:')\n", " while head < totallen:\n", " tail = head + duration\n", " if tail > totallen:\n", " tail = totallen\n", " signali = signal[fs * head : np.min([fs * tail, fs * totallen])] \n", " showresult(recognizer, signali, fs, head)\n", " head += step" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recognition results:\n", "('0:00:00', ('Male', -19.65672572544716))\n", "('0:00:05', ('Male', -19.389260191396541))\n", "('0:00:10', ('Male', -19.886238792273502))\n", "('0:00:15', ('Male', -19.988046642253273))\n", "('0:00:20', ('Male', -20.857762606257122))\n", "('0:00:25', ('Female', -20.056092628403363))\n", "('0:00:30', ('Female', -19.888043075692561))\n", "('0:00:35', ('Female', -19.657557661472801))\n", "('0:00:40', ('Female', -19.745223859738523))\n", "('0:00:45', ('Female', -19.680926940400678))\n", "('0:00:50', ('Female', -19.458031006355842))\n", "('0:00:55', ('Female', -19.553981803248707))\n", "('0:01:00', ('Female', -20.053499089615951))\n", "('0:01:05', ('Male', -19.686199644242794))\n", "('0:01:10', ('Male', -19.852808517223))\n", "('0:01:15', ('Male', -20.039521601708593))\n", "('0:01:20', ('Male', -19.904757723357431))\n", "('0:01:25', ('Male', -20.143563372546421))\n", "('0:01:30', ('Male', -19.966010831665649))\n", "('0:01:35', ('Male', -19.826530139561765))\n", "('0:01:40', ('Male', -19.912105539081182))\n", "('0:01:45', ('Male', -19.848151795975433))\n", "('0:01:50', ('Male', -19.684047168185359))\n", "('0:01:55', ('Male', -19.983463014416124))\n", "('0:02:00', ('Male', -19.397841075840084))\n", "('0:02:05', ('Male', -19.766136825379665))\n", "('0:02:10', ('Female', -19.768935512293602))\n" ] } ], "source": [ "recognize(Gender, './Audio/female-male.wav', step = 5, duration = 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }