{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.cross_validation import train_test_split, cross_val_score\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn import svm\n",
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#this function converts name to a feature vector\n",
    "def convert_name(name):\n",
    "    \"\"\"Convert a name into a fixed-length numeric feature vector.\n",
    "\n",
    "    Layout of the returned array (length 26*26 + 5 = 681):\n",
    "\n",
    "    * arr[0] .. arr[675]: how often each adjacent pair of lowercase\n",
    "      letters (2-gram) occurs; the pair (c1, c2) is counted at index\n",
    "      (ord(c1) - ord('a')) * 26 + (ord(c2) - ord('a')).\n",
    "    * arr[-5]: my_features value of the first 3 characters.\n",
    "    * arr[-4]: my_features value of the 3 characters before the last one.\n",
    "    * arr[-3]: my_features value of the first 3 of the last 6 characters.\n",
    "    * arr[-2]: my_features value of the last 3 characters.\n",
    "    * arr[-1]: 1 if the last character is a vowel (a, e, i, o, u), else 0.\n",
    "\n",
    "    A trigram slot stays 0 when the trigram is not a key of my_features,\n",
    "    a mapping expected in the notebook's global namespace (it is not\n",
    "    defined in this cell). Python slicing never fails on short strings,\n",
    "    so for names shorter than 6 characters the trigram windows shrink\n",
    "    and may overlap.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    name : object\n",
    "        The name to encode; it is converted with str() first.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        1-D float array of length 681.\n",
    "    \"\"\"\n",
    "    alphabet = 'abcdefghijklmnopqrstuvwxyz'\n",
    "    n_letters = len(alphabet)\n",
    "    # 26*26 possible 2-grams + 5 extra slots (4 trigram lookups, 1 vowel flag)\n",
    "    arr = np.zeros(n_letters * n_letters + 5)\n",
    "    name = str(name)\n",
    "\n",
    "    # Count every adjacent pair of characters (2-gram). A pair containing\n",
    "    # anything other than a-z is skipped: index arithmetic on such a\n",
    "    # character would produce a negative index (which silently wraps round\n",
    "    # onto an unrelated slot) or an out-of-range one (IndexError).\n",
    "    for first, second in zip(name, name[1:]):\n",
    "        row = alphabet.find(first)\n",
    "        col = alphabet.find(second)\n",
    "        if row != -1 and col != -1:\n",
    "            arr[row * n_letters + col] += 1\n",
    "\n",
    "    # check if last character is a vowel; slicing (rather than name[-1])\n",
    "    # keeps an empty name from raising IndexError\n",
    "    arr[-1] = 1 if name[-1:] in ('a', 'e', 'i', 'o', 'u') else 0\n",
    "\n",
    "    # Trigram features: each slot receives the my_features value of its\n",
    "    # trigram, or stays 0 when the trigram is not a known key.\n",
    "    trigram_slots = (\n",
    "        (-2, name[-3:]),      # last 3 characters\n",
    "        (-3, name[-6:][:3]),  # first 3 of the last 6 characters\n",
    "        (-4, name[-4:][:3]),  # 3 characters just before the last one\n",
    "        (-5, name[:3]),       # first 3 characters\n",
    "    )\n",
    "    for slot, trigram in trigram_slots:\n",
    "        if trigram in my_features:\n",
    "            arr[slot] = my_features[trigram]\n",
    "    return arr"
   ]
  },
  { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#load data\n", "my_data = np.genfromtxt('data/gender.csv', \n", " delimiter=',', \n", " dtype=[('name','" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#check important features\n",
    "importances = clf.feature_importances_\n",
    "# spread of each feature's importance across the individual trees (drawn as error bars below)\n",
    "std = np.std([tree.feature_importances_ for tree in clf.estimators_],\n",
    "             axis=0)\n",
    "# feature indices ordered by decreasing importance; keep the 10 largest\n",
    "indices = np.argsort(importances)[::-1]\n",
    "indices = indices[:10]\n",
    "# Print the feature ranking\n",
    "print(\"Feature ranking:\")\n",
    "\n",
    "for f in range(10):\n",
    "    print(\"%d. feature %d (%f)\" % (f + 1, indices[f], importances[indices[f]]))\n",
    "\n",
    "# bar chart of the same 10 features, x tick labels are the feature indices\n",
    "import matplotlib.pyplot as plt\n",
    "plt.figure()\n",
    "plt.title(\"Feature importances\")\n",
    "plt.bar(range(10), importances[indices],\n",
    "        color=\"r\", yerr=std[indices], align=\"center\")\n",
    "plt.xticks(range(10), indices)\n",
    "plt.xlim([-1, 10])\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vaibahv 1 1\n",
      "sagrika 0 0\n",
      "ramashray 1 1\n",
      "karekar 1 1\n",
      "harshika 0 0\n",
      "deepaak 1 1\n",
      "fulo 0 0\n",
      "mahiya 0 0\n",
      "punyasloka 1 1\n",
      "pemaram 1 1\n"
     ]
    }
   ],
   "source": [
    "# spot check: for 10 random rows print the name, its y value and the model's prediction\n",
    "idx = np.random.choice(np.arange(len(Xlist)), 10, replace=False)\n",
    "xs = my_data['name'][idx]\n",
    "ys = y[idx]\n",
    "pred = clf.predict(X[idx])\n",
    "for a,b, p in zip(xs,ys, pred):\n",
    "    print(a,b,p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1])"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#give some names not present in test/training data (manual check)\n",
    "xreal = name_map(['virinchi','samanvi','ashrith','saikeerthi','seol','vistista','kranthi','ramraji','anasuya','mangli','keerthi','reddy','shastry'])\n",
    "Xreal_conveted = np.array(xreal.tolist())\n",
    "pred = clf.predict(Xreal_conveted)\n",
    "pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}