{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 2: Processing data for machine learning\n",
    "\n",
    "To simplify the code examples in these notebooks, we populate the namespace with functions from numpy and matplotlib:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "%pylab inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting categorical data to numerical features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cat_data = array(['male', 'female', 'male', 'male', 'female', 'male', 'female', 'female'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def cat_to_num(data):\n",
    "    \"\"\"Convert a categorical feature into binary indicator features.\n",
    "\n",
    "    `data` is an array-like of category labels. Returns a list with one\n",
    "    integer array per distinct label (in the sorted order produced by\n",
    "    `unique`), holding 1 where the sample has that label and 0 elsewhere.\n",
    "    \"\"\"\n",
    "    # asarray so plain lists also compare elementwise against each label\n",
    "    data = asarray(data)\n",
    "    return [(data == category).astype(\"int\") for category in unique(data)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([0, 1, 0, 0, 1, 0, 1, 1]), array([1, 0, 1, 1, 0, 1, 0, 0])]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_to_num(cat_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Simple feature engineering of the Titanic dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cabin_data = array([\"C65\", \"\", \"E36\", \"C54\", \"B57 B59 B63 B66\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def cabin_features(data):\n",
    "    \"\"\"Split raw cabin strings into three simple features per passenger.\n",
    "\n",
    "    Each entry of `data` is a string of whitespace-separated cabin codes\n",
    "    such as 'B57 B59' (empty when no cabin is recorded). Returns one\n",
    "    `[cabin_char, cabin_num, n_cabins]` list per entry, built from the\n",
    "    first listed cabin: its leading character ('X' if there is none), its\n",
    "    number (-1 if missing or not numeric) and how many cabins were listed.\n",
    "    \"\"\"\n",
    "    features = []\n",
    "    for cabin in data:\n",
    "        # split() without arguments ignores repeated/leading spaces, so\n",
    "        # stray whitespace is not counted as an extra cabin\n",
    "        cabins = cabin.split()\n",
    "        if not cabins:\n",
    "            # No cabin recorded for this passenger\n",
    "            features.append([\"X\", -1, 0])\n",
    "            continue\n",
    "        first_cabin = cabins[0]\n",
    "        # First char is the cabin_char, the rest is the cabin number\n",
    "        try:\n",
    "            cabin_num = int(first_cabin[1:])\n",
    "        except ValueError:\n",
    "            # Codes such as 'D' carry no number\n",
    "            cabin_num = -1\n",
    "        # Add 3 features for each passenger\n",
    "        features.append([first_cabin[0], cabin_num, len(cabins)])\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['C', 65, 1], ['X', -1, 0], ['E', 36, 1], ['C', 54, 1], ['B', 57, 4]]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cabin_features(cabin_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "num_data = array([1, 10, 0.5, 43, 0.12, 8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def normalize_feature(data, f_min=-1, f_max=1):\n",
    "    \"\"\"Linearly rescale a numerical feature to the range [f_min, f_max].\n",
    "\n",
    "    `data` is an array-like of numbers. Returns `(normalized, factor)`,\n",
    "    where `normalized = f_min + (data - min(data)) * factor` and\n",
    "    `factor = (f_max - f_min) / (max(data) - min(data))`.\n",
    "    A constant feature (max == min) is mapped to `f_min` with factor 0.0.\n",
    "    \"\"\"\n",
    "    # Work in floats so integer input cannot trigger integer division\n",
    "    data = asarray(data, dtype=float)\n",
    "    d_min, d_max = data.min(), data.max()\n",
    "    d_range = d_max - d_min\n",
    "    if d_range == 0:\n",
    "        # No spread to rescale; avoid dividing by zero\n",
    "        factor = 0.0\n",
    "    else:\n",
    "        factor = (f_max - f_min) / d_range\n",
    "    # Shift by d_min first so d_min maps to f_min and d_max to f_max\n",
    "    normalized = f_min + (data - d_min) * factor\n",
    "    return normalized, factor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([-0.95895522, -0.5391791 , -0.98227612,  1.        , -1.        ,\n",
       "       -0.63246269]), 0.046641791044776115)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normalize_feature(num_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}