{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 2: Processing data for machine learning\n",
    "\n",
    "To simplify the code examples in these notebooks, we populate the namespace with functions from numpy and matplotlib:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "%pylab inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Converting categorical data to numerical features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cat_data = array(['male', 'female', 'male', 'male', 'female', 'male', 'female', 'female'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def cat_to_num(data):\n",
    "    categories = unique(data)\n",
    "    features = []\n",
    "    for cat in categories:\n",
    "        binary = (data == cat)\n",
    "        features.append(binary.astype(\"int\"))\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([0, 1, 0, 0, 1, 0, 1, 1]), array([1, 0, 1, 1, 0, 1, 0, 0])]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cat_to_num(cat_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Simple feature engineering of the Titanic dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cabin_data = array([\"C65\", \"\", \"E36\", \"C54\", \"B57 B59 B63 B66\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def cabin_features(data):\n",
    "    features = []\n",
    "    for cabin in data:\n",
    "        cabins = cabin.split(\" \")\n",
    "        n_cabins = len(cabins)\n",
    "        # First char is the cabin_char\n",
    "        try:\n",
    "            cabin_char = cabins[0][0]\n",
    "        except IndexError:\n",
    "            cabin_char = \"X\"\n",
    "            n_cabins = 0\n",
    "        # The rest is the cabin number\n",
    "        try:\n",
    "            cabin_num = int(cabins[0][1:]) \n",
    "        except:\n",
    "            cabin_num = -1\n",
    "        # Add 3 features for each passanger\n",
    "        features.append( [cabin_char, cabin_num, n_cabins] )\n",
    "    return features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['C', 65, 1], ['X', -1, 0], ['E', 36, 1], ['C', 54, 1], ['B', 57, 4]]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cabin_features(cabin_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "num_data = array([1, 10, 0.5, 43, 0.12, 8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def normalize_feature(data, f_min=-1, f_max=1):\n",
    "    d_min, d_max = min(data), max(data)\n",
    "    factor = (f_max - f_min) / (d_max - d_min)\n",
    "    normalized = f_min + data*factor\n",
    "    return normalized, factor\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([-0.95335821, -0.53358209, -0.9766791 ,  1.00559701, -0.99440299,\n",
       "        -0.62686567]), 0.046641791044776115)"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normalize_feature(num_data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}