{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Import the speaker recognition module\n", "import speaker.recognition as SR\n", "Gender = SR.GMMRec()  # GMM-based recognizer; here used to tell female from male voices" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import scipy.io.wavfile as wav\n", "from speaker.silence import remove_silence\n", "from features import mfcc\n", "def get_mfcc(audio_path):\n", "    (sr, sig) = wav.read(audio_path)\n", "    if len(sig.shape) > 1:\n", "        sig = sig[:, 0]  # keep only the first channel of multi-channel audio\n", "    cleansig = remove_silence(sr, sig)  # strip silent frames before feature extraction\n", "    mfcc_vecs = mfcc(cleansig, sr, numcep=15)\n", "    return mfcc_vecs" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import numpy as np\n", "\n", "# Here we use MFCCs as the audio features; in principle other audio features (e.g. LPC) should work as well\n", "female_mfcc = np.array(get_mfcc('./Audio/female.wav'))  # female_mfcc.shape = (N1, D): N1 vectors of dimension D\n", "male_mfcc = np.array(get_mfcc('./Audio/male.wav'))  # male_mfcc.shape = (N2, D)\n", "Gender.enroll('Female', female_mfcc)  # enroll the female audio features\n", "Gender.enroll('Male', male_mfcc)  # enroll the male audio features\n", "Gender.train()  # train the GMMs with PyCASP\n", "Gender.dump('gender.model')  # save the trained model to a file named \"gender.model\" for future use" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "('Female', -22.373874909185876)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Gender = SR.GMMRec.load('gender.model')  # not necessary if you just trained the model\n", "test_mfcc = np.array(get_mfcc('/Users/xuhe/Downloads/SpectrogramInversion1.02b/tapestr_rec.wav'))  # test_mfcc.shape = (N3, D)\n", "Gender.predict(test_mfcc)  # returns (label, log_lkld): the most probable speaker label and the log likelihood of test_mfcc under that speaker's model" ] },
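{ "cell_type": "markdown", "metadata": {}, "source": [ "Since `predict` returns a `(label, log_lkld)` pair, the log likelihood can also be used to reject low-confidence predictions, e.g. audio from a speaker who was never enrolled. Below is a minimal sketch of that idea; the helper `predict_with_reject` and the `LLKD_THRESHOLD` value are illustrative assumptions, not part of the speaker.recognition API, and a real threshold should be tuned on held-out audio." ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Hypothetical helper: reject predictions whose log likelihood falls below a cutoff.\n", "LLKD_THRESHOLD = -25.0  # assumed cutoff, chosen for illustration only\n", "\n", "def predict_with_reject(recognizer, audio_path, threshold=LLKD_THRESHOLD):\n", "    feats = np.array(get_mfcc(audio_path))\n", "    label, log_lkld = recognizer.predict(feats)\n", "    if log_lkld < threshold:\n", "        return ('Unknown', log_lkld)  # too unlikely to be any enrolled speaker\n", "    return (label, log_lkld)\n", "\n", "# Example usage:\n", "# predict_with_reject(Gender, './Audio/female.wav')" ] },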
" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def totime(secs):\n", " m, s = divmod(secs, 60)\n", " h, m = divmod(m, 60)\n", " return h, m, s\n", "\n", "def showresult(recognizer, sig, sr, head):\n", " cleansig = remove_silence(sr, sig)\n", " mfcc_vecs = mfcc(cleansig, sr, numcep = 15) \n", " print(\"%d:%02d:%02d\" % (totime(head)), recognizer.predict(\n", " mfcc_vecs))\n", "\n", "def recognize(recognizer, audio_path, step = 1, duration = 1.5):\n", " (fs, signal) = wav.read(audio_path)\n", " if len(signal.shape) > 1:\n", " signal = signal[:, 0]\n", " head = 0\n", " totallen = np.round(signal.shape[0] / fs).astype(int)\n", " print('Recognition results:')\n", " while head < totallen:\n", " tail = head + duration\n", " if tail > totallen:\n", " tail = totallen\n", " signali = signal[fs * head : np.min([fs * tail, fs * totallen])] \n", " showresult(recognizer, signali, fs, head)\n", " head += step" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Recognition results:\n", "('0:00:00', ('Male', -19.65672572544716))\n", "('0:00:05', ('Male', -19.389260191396541))\n", "('0:00:10', ('Male', -19.886238792273502))\n", "('0:00:15', ('Male', -19.988046642253273))\n", "('0:00:20', ('Male', -20.857762606257122))\n", "('0:00:25', ('Female', -20.056092628403363))\n", "('0:00:30', ('Female', -19.888043075692561))\n", "('0:00:35', ('Female', -19.657557661472801))\n", "('0:00:40', ('Female', -19.745223859738523))\n", "('0:00:45', ('Female', -19.680926940400678))\n", "('0:00:50', ('Female', -19.458031006355842))\n", "('0:00:55', ('Female', -19.553981803248707))\n", "('0:01:00', ('Female', -20.053499089615951))\n", "('0:01:05', ('Male', -19.686199644242794))\n", "('0:01:10', ('Male', -19.852808517223))\n", "('0:01:15', ('Male', -20.039521601708593))\n", "('0:01:20', ('Male', -19.904757723357431))\n", "('0:01:25', ('Male', -20.143563372546421))\n", "('0:01:30', ('Male', -19.966010831665649))\n", "('0:01:35', ('Male', -19.826530139561765))\n", "('0:01:40', ('Male', -19.912105539081182))\n", "('0:01:45', ('Male', -19.848151795975433))\n", "('0:01:50', ('Male', -19.684047168185359))\n", "('0:01:55', ('Male', -19.983463014416124))\n", "('0:02:00', ('Male', -19.397841075840084))\n", "('0:02:05', ('Male', -19.766136825379665))\n", "('0:02:10', ('Female', -19.768935512293602))\n" ] } ], "source": [ "recognize(Gender, './Audio/female-male.wav', step = 5, duration = 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }