{
 "metadata": {
  "name": ""
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Building a fashion recommender (III): Content based recommender"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "__The explanation of this implementation can be found at [www.rosariomgomez.me](http://www.rosariomgomez.me/)__ <br>\n",
      "\n",
      "__Index__<br>\n",
      "1. [Build the training and testing sets](#1.-Build-the-training-and-testing-sets)<br>\n",
      "2. [Estimation functions](#2.-Estimation-functions)<br>\n",
      "3. [Content based recommendations](#3.-Content-based-recommendation-engine)<br>"
     ]
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "1. Build the training and testing sets"
     ]
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "1.1. Build the dataframes"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np\n",
      "import pandas as pd\n",
      "from create_features import create_pin_features, create_user_features"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_db():\n",
      "    \"\"\"Return an authenticated handle to the MongoDB database.\n",
      "\n",
      "    Connection settings come from environment variables, so neither the\n",
      "    server address nor the credentials are stored in the notebook:\n",
      "    MONGO_HOST (default 'localhost'), MONGO_PORT (default 27017),\n",
      "    MONGO_DB, MONGO_USER and MONGO_PASSWORD (these three are required).\n",
      "\n",
      "    Raises KeyError when a required variable is not set.\n",
      "    \"\"\"\n",
      "    import os\n",
      "    from pymongo import MongoClient\n",
      "    host = os.environ.get('MONGO_HOST', 'localhost')\n",
      "    port = int(os.environ.get('MONGO_PORT', '27017'))\n",
      "    client = MongoClient(host, port)\n",
      "    db = client[os.environ['MONGO_DB']]\n",
      "    #authenticate() matches the PyMongo 2.x/3.x Database API\n",
      "    #NOTE(review): PyMongo 4 removed it - confirm the installed version\n",
      "    db.authenticate(os.environ['MONGO_USER'], os.environ['MONGO_PASSWORD'])\n",
      "    return db"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#open the MongoDB connection; `db` is the handle the cells below query\n",
      "db = get_db()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#retrieve all the ratings from the DB\n",
      "rated_outfits = db.ratings.find()\n",
      "#materialise the cursor into a plain list of rating documents\n",
      "list_ratings = list(rated_outfits)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#fetch every user document from the DB and turn each one into a feature vector\n",
      "all_users = db.user.find()\n",
      "list_users = list(map(create_user_features, all_users))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#fetch every item (pin) document from the DB and turn each one into a feature vector\n",
      "all_pins = db.fullpin.find()\n",
      "list_pins = list(map(create_pin_features, all_pins))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#build the pandas items dataframe; the '_id' column becomes 'pin_id'\n",
      "#so that its name is in line with the ratings names\n",
      "items = pd.DataFrame(list_pins).rename(columns={'_id': 'pin_id'})\n",
      "items.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>pin_id</th>\n",
        "      <th>blog_name</th>\n",
        "      <th>blogger_age</th>\n",
        "      <th>blogger_body_shape_apple</th>\n",
        "      <th>blogger_body_shape_hourglass</th>\n",
        "      <th>blogger_body_shape_inverted_triangle</th>\n",
        "      <th>blogger_body_shape_pear</th>\n",
        "      <th>blogger_body_shape_rectangle</th>\n",
        "      <th>blogger_dress_size</th>\n",
        "      <th>blogger_style</th>\n",
        "      <th>blogger_style_bohemian chic</th>\n",
        "      <th>blogger_style_casual chic</th>\n",
        "      <th>blogger_style_classic</th>\n",
        "      <th>blogger_style_edgy</th>\n",
        "      <th>blogger_style_preppy</th>\n",
        "      <th>blogger_style_romantic</th>\n",
        "      <th>brands_ASOS</th>\n",
        "      <th>brands_Abercrombie &amp; Fitch</th>\n",
        "      <th>brands_Accessorize</th>\n",
        "      <th>brands_Alexander McQueen</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 537934c861e01f10f1118dea</td>\n",
        "      <td> Hallie Daily</td>\n",
        "      <td> 40</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 6</td>\n",
        "      <td> [classic, romantic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 537934d261e01f10f1118e1b</td>\n",
        "      <td> Hallie Daily</td>\n",
        "      <td> 40</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 6</td>\n",
        "      <td>           [classic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 537934cc61e01f10f1118e00</td>\n",
        "      <td> Hallie Daily</td>\n",
        "      <td> 40</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 6</td>\n",
        "      <td>           [classic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 537934c661e01f10f1118ddf</td>\n",
        "      <td> Hallie Daily</td>\n",
        "      <td> 40</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 6</td>\n",
        "      <td> [classic, romantic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 537934c661e01f10f1118ddd</td>\n",
        "      <td> Hallie Daily</td>\n",
        "      <td> 40</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 6</td>\n",
        "      <td>           [classic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 319 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "                     pin_id     blog_name  blogger_age  \\\n",
        "0  537934c861e01f10f1118dea  Hallie Daily           40   \n",
        "1  537934d261e01f10f1118e1b  Hallie Daily           40   \n",
        "2  537934cc61e01f10f1118e00  Hallie Daily           40   \n",
        "3  537934c661e01f10f1118ddf  Hallie Daily           40   \n",
        "4  537934c661e01f10f1118ddd  Hallie Daily           40   \n",
        "\n",
        "   blogger_body_shape_apple  blogger_body_shape_hourglass  \\\n",
        "0                         0                             1   \n",
        "1                         0                             1   \n",
        "2                         0                             1   \n",
        "3                         0                             1   \n",
        "4                         0                             1   \n",
        "\n",
        "   blogger_body_shape_inverted_triangle  blogger_body_shape_pear  \\\n",
        "0                                     0                        0   \n",
        "1                                     0                        0   \n",
        "2                                     0                        0   \n",
        "3                                     0                        0   \n",
        "4                                     0                        0   \n",
        "\n",
        "   blogger_body_shape_rectangle  blogger_dress_size        blogger_style  \\\n",
        "0                             0                   6  [classic, romantic]   \n",
        "1                             0                   6            [classic]   \n",
        "2                             0                   6            [classic]   \n",
        "3                             0                   6  [classic, romantic]   \n",
        "4                             0                   6            [classic]   \n",
        "\n",
        "   blogger_style_bohemian chic  blogger_style_casual chic  \\\n",
        "0                            0                          0   \n",
        "1                            0                          0   \n",
        "2                            0                          0   \n",
        "3                            0                          0   \n",
        "4                            0                          0   \n",
        "\n",
        "   blogger_style_classic  blogger_style_edgy  blogger_style_preppy  \\\n",
        "0                      1                   0                     0   \n",
        "1                      1                   0                     0   \n",
        "2                      1                   0                     0   \n",
        "3                      1                   0                     0   \n",
        "4                      1                   0                     0   \n",
        "\n",
        "   blogger_style_romantic  brands_ASOS  brands_Abercrombie & Fitch  \\\n",
        "0                       1            0                           0   \n",
        "1                       0            0                           0   \n",
        "2                       0            0                           0   \n",
        "3                       1            0                           0   \n",
        "4                       0            0                           0   \n",
        "\n",
        "   brands_Accessorize  brands_Alexander McQueen      \n",
        "0                   0                         0 ...  \n",
        "1                   0                         0 ...  \n",
        "2                   0                         0 ...  \n",
        "3                   0                         0 ...  \n",
        "4                   0                         0 ...  \n",
        "\n",
        "[5 rows x 319 columns]"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#build the ratings dataframe, restricted to the columns listed in cols\n",
      "cols = ['user_id', 'pin_id', 'rating']\n",
      "ratings = pd.DataFrame(data=list_ratings, columns=cols)\n",
      "ratings.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>user_id</th>\n",
        "      <th>pin_id</th>\n",
        "      <th>rating</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 538677f561e01f0be9e838f7</td>\n",
        "      <td> 537933bb61e01f10f111886f</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 538677f561e01f0be9e838f7</td>\n",
        "      <td> 537933a161e01f10f11187e6</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 538677f561e01f0be9e838f7</td>\n",
        "      <td> 53793d6d61e01f10f111a725</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 538677f561e01f0be9e838f7</td>\n",
        "      <td> 537933be61e01f10f111887f</td>\n",
        "      <td> 2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 538677f561e01f0be9e838f7</td>\n",
        "      <td> 53793d8861e01f10f111a741</td>\n",
        "      <td> 0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 3 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 8,
       "text": [
        "                    user_id                    pin_id  rating\n",
        "0  538677f561e01f0be9e838f7  537933bb61e01f10f111886f       0\n",
        "1  538677f561e01f0be9e838f7  537933a161e01f10f11187e6       0\n",
        "2  538677f561e01f0be9e838f7  53793d6d61e01f10f111a725       2\n",
        "3  538677f561e01f0be9e838f7  537933be61e01f10f111887f       2\n",
        "4  538677f561e01f0be9e838f7  53793d8861e01f10f111a741       0\n",
        "\n",
        "[5 rows x 3 columns]"
       ]
      }
     ],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#build the pandas users dataframe; the '_id' column becomes 'user_id'\n",
      "#so that its name is in line with the ratings names\n",
      "users = pd.DataFrame(list_users).rename(columns={'_id': 'user_id'})\n",
      "users.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>user_id</th>\n",
        "      <th>age</th>\n",
        "      <th>country</th>\n",
        "      <th>day_off</th>\n",
        "      <th>dress_size</th>\n",
        "      <th>fashionista</th>\n",
        "      <th>like_styles_pref</th>\n",
        "      <th>like_styles_pref_bohemian chic</th>\n",
        "      <th>like_styles_pref_casual chic</th>\n",
        "      <th>like_styles_pref_classic</th>\n",
        "      <th>like_styles_pref_edgy</th>\n",
        "      <th>like_styles_pref_preppy</th>\n",
        "      <th>like_styles_pref_romantic</th>\n",
        "      <th>nolike_styles_pref</th>\n",
        "      <th>nolike_styles_pref_bohemian chic</th>\n",
        "      <th>nolike_styles_pref_casual chic</th>\n",
        "      <th>nolike_styles_pref_classic</th>\n",
        "      <th>nolike_styles_pref_edgy</th>\n",
        "      <th>nolike_styles_pref_preppy</th>\n",
        "      <th>nolike_styles_pref_romantic</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 53962dfa3191490008a690df</td>\n",
        "      <td> 55</td>\n",
        "      <td> US</td>\n",
        "      <td>  sport</td>\n",
        "      <td> 10</td>\n",
        "      <td> nolike</td>\n",
        "      <td>  [classic, casual chic, preppy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td>                    []</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 53968e993d4e0c0007a2546f</td>\n",
        "      <td> 50</td>\n",
        "      <td> US</td>\n",
        "      <td> family</td>\n",
        "      <td>  6</td>\n",
        "      <td> nolike</td>\n",
        "      <td>                              []</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>                [edgy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 53971683b7d85a0008b1bbe2</td>\n",
        "      <td> 30</td>\n",
        "      <td> ES</td>\n",
        "      <td> family</td>\n",
        "      <td>  8</td>\n",
        "      <td> nolike</td>\n",
        "      <td>         [romantic, casual chic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>                [edgy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 539851a17f6ba70007ba8bdb</td>\n",
        "      <td> 30</td>\n",
        "      <td> ES</td>\n",
        "      <td>  party</td>\n",
        "      <td> 10</td>\n",
        "      <td> nolike</td>\n",
        "      <td> [romantic, casual chic, preppy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td>                [edgy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 539770c4a9a4570008c28a9b</td>\n",
        "      <td> 45</td>\n",
        "      <td> ES</td>\n",
        "      <td> family</td>\n",
        "      <td> 10</td>\n",
        "      <td>     ok</td>\n",
        "      <td>  [classic, casual chic, preppy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> [bohemian chic, edgy]</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 27 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "                    user_id  age country day_off  dress_size fashionista  \\\n",
        "0  53962dfa3191490008a690df   55      US   sport          10      nolike   \n",
        "1  53968e993d4e0c0007a2546f   50      US  family           6      nolike   \n",
        "2  53971683b7d85a0008b1bbe2   30      ES  family           8      nolike   \n",
        "3  539851a17f6ba70007ba8bdb   30      ES   party          10      nolike   \n",
        "4  539770c4a9a4570008c28a9b   45      ES  family          10          ok   \n",
        "\n",
        "                  like_styles_pref  like_styles_pref_bohemian chic  \\\n",
        "0   [classic, casual chic, preppy]                               0   \n",
        "1                               []                               0   \n",
        "2          [romantic, casual chic]                               0   \n",
        "3  [romantic, casual chic, preppy]                               0   \n",
        "4   [classic, casual chic, preppy]                               0   \n",
        "\n",
        "   like_styles_pref_casual chic  like_styles_pref_classic  \\\n",
        "0                             1                         1   \n",
        "1                             0                         0   \n",
        "2                             1                         0   \n",
        "3                             1                         0   \n",
        "4                             1                         1   \n",
        "\n",
        "   like_styles_pref_edgy  like_styles_pref_preppy  like_styles_pref_romantic  \\\n",
        "0                      0                        1                          0   \n",
        "1                      0                        0                          0   \n",
        "2                      0                        0                          1   \n",
        "3                      0                        1                          1   \n",
        "4                      0                        1                          0   \n",
        "\n",
        "      nolike_styles_pref  nolike_styles_pref_bohemian chic  \\\n",
        "0                     []                                 0   \n",
        "1                 [edgy]                                 0   \n",
        "2                 [edgy]                                 0   \n",
        "3                 [edgy]                                 0   \n",
        "4  [bohemian chic, edgy]                                 1   \n",
        "\n",
        "   nolike_styles_pref_casual chic  nolike_styles_pref_classic  \\\n",
        "0                               0                           0   \n",
        "1                               0                           0   \n",
        "2                               0                           0   \n",
        "3                               0                           0   \n",
        "4                               0                           0   \n",
        "\n",
        "   nolike_styles_pref_edgy  nolike_styles_pref_preppy  \\\n",
        "0                        0                          0   \n",
        "1                        1                          0   \n",
        "2                        1                          0   \n",
        "3                        1                          0   \n",
        "4                        1                          0   \n",
        "\n",
        "   nolike_styles_pref_romantic      \n",
        "0                            0 ...  \n",
        "1                            0 ...  \n",
        "2                            0 ...  \n",
        "3                            0 ...  \n",
        "4                            0 ...  \n",
        "\n",
        "[5 rows x 27 columns]"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#merge ratings, items and users dataframes\n",
      "#the join keys are spelled out so that a feature column that happens to share\n",
      "#a name across dataframes can never be picked up as an extra merge key\n",
      "fashion = ratings.merge(users, on='user_id').merge(items, on='pin_id')\n",
      "fashion.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>user_id</th>\n",
        "      <th>pin_id</th>\n",
        "      <th>rating</th>\n",
        "      <th>age</th>\n",
        "      <th>country</th>\n",
        "      <th>day_off</th>\n",
        "      <th>dress_size</th>\n",
        "      <th>fashionista</th>\n",
        "      <th>like_styles_pref</th>\n",
        "      <th>like_styles_pref_bohemian chic</th>\n",
        "      <th>like_styles_pref_casual chic</th>\n",
        "      <th>like_styles_pref_classic</th>\n",
        "      <th>like_styles_pref_edgy</th>\n",
        "      <th>like_styles_pref_preppy</th>\n",
        "      <th>like_styles_pref_romantic</th>\n",
        "      <th>nolike_styles_pref</th>\n",
        "      <th>nolike_styles_pref_bohemian chic</th>\n",
        "      <th>nolike_styles_pref_casual chic</th>\n",
        "      <th>nolike_styles_pref_classic</th>\n",
        "      <th>nolike_styles_pref_edgy</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
        "      <td> 538677f561e01f0be9e838f7</td>\n",
        "      <td> 537933bb61e01f10f111886f</td>\n",
        "      <td> 0</td>\n",
        "      <td> 30</td>\n",
        "      <td> US</td>\n",
        "      <td>  sport</td>\n",
        "      <td>  8</td>\n",
        "      <td>   love</td>\n",
        "      <td> [classic, casual chic, preppy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td>                [edgy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
        "      <td> 539604e40aa8e20007a976fb</td>\n",
        "      <td> 537933bb61e01f10f111886f</td>\n",
        "      <td> 2</td>\n",
        "      <td> 30</td>\n",
        "      <td> ES</td>\n",
        "      <td>   read</td>\n",
        "      <td>  8</td>\n",
        "      <td>     ok</td>\n",
        "      <td>                  [casual chic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>                    []</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
        "      <td> 539616880aa8e20008b99e19</td>\n",
        "      <td> 537933bb61e01f10f111886f</td>\n",
        "      <td> 2</td>\n",
        "      <td> 25</td>\n",
        "      <td> ES</td>\n",
        "      <td> family</td>\n",
        "      <td>  6</td>\n",
        "      <td>     ok</td>\n",
        "      <td>          [casual chic, preppy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td>                [edgy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
        "      <td> 539628df3191490008a690d5</td>\n",
        "      <td> 537933bb61e01f10f111886f</td>\n",
        "      <td> 2</td>\n",
        "      <td> 40</td>\n",
        "      <td> US</td>\n",
        "      <td>  sport</td>\n",
        "      <td> 10</td>\n",
        "      <td>     ok</td>\n",
        "      <td>                      [classic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td>                [edgy]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
        "      <td> 5396d3a6989960000db79381</td>\n",
        "      <td> 537933bb61e01f10f111886f</td>\n",
        "      <td> 1</td>\n",
        "      <td> 30</td>\n",
        "      <td> ES</td>\n",
        "      <td>  party</td>\n",
        "      <td> 10</td>\n",
        "      <td> nolike</td>\n",
        "      <td>         [classic, casual chic]</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> [bohemian chic, edgy]</td>\n",
        "      <td> 1</td>\n",
        "      <td> 0</td>\n",
        "      <td> 0</td>\n",
        "      <td> 1</td>\n",
        "      <td>...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 347 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 10,
       "text": [
        "                    user_id                    pin_id  rating  age country  \\\n",
        "0  538677f561e01f0be9e838f7  537933bb61e01f10f111886f       0   30      US   \n",
        "1  539604e40aa8e20007a976fb  537933bb61e01f10f111886f       2   30      ES   \n",
        "2  539616880aa8e20008b99e19  537933bb61e01f10f111886f       2   25      ES   \n",
        "3  539628df3191490008a690d5  537933bb61e01f10f111886f       2   40      US   \n",
        "4  5396d3a6989960000db79381  537933bb61e01f10f111886f       1   30      ES   \n",
        "\n",
        "  day_off  dress_size fashionista                like_styles_pref  \\\n",
        "0   sport           8        love  [classic, casual chic, preppy]   \n",
        "1    read           8          ok                   [casual chic]   \n",
        "2  family           6          ok           [casual chic, preppy]   \n",
        "3   sport          10          ok                       [classic]   \n",
        "4   party          10      nolike          [classic, casual chic]   \n",
        "\n",
        "   like_styles_pref_bohemian chic  like_styles_pref_casual chic  \\\n",
        "0                               0                             1   \n",
        "1                               0                             1   \n",
        "2                               0                             1   \n",
        "3                               0                             0   \n",
        "4                               0                             1   \n",
        "\n",
        "   like_styles_pref_classic  like_styles_pref_edgy  like_styles_pref_preppy  \\\n",
        "0                         1                      0                        1   \n",
        "1                         0                      0                        0   \n",
        "2                         0                      0                        1   \n",
        "3                         1                      0                        0   \n",
        "4                         1                      0                        0   \n",
        "\n",
        "   like_styles_pref_romantic     nolike_styles_pref  \\\n",
        "0                          0                 [edgy]   \n",
        "1                          0                     []   \n",
        "2                          0                 [edgy]   \n",
        "3                          0                 [edgy]   \n",
        "4                          0  [bohemian chic, edgy]   \n",
        "\n",
        "   nolike_styles_pref_bohemian chic  nolike_styles_pref_casual chic  \\\n",
        "0                                 0                               0   \n",
        "1                                 0                               0   \n",
        "2                                 0                               0   \n",
        "3                                 0                               0   \n",
        "4                                 1                               0   \n",
        "\n",
        "   nolike_styles_pref_classic  nolike_styles_pref_edgy      \n",
        "0                           0                        1 ...  \n",
        "1                           0                        0 ...  \n",
        "2                           0                        1 ...  \n",
        "3                           0                        1 ...  \n",
        "4                           0                        1 ...  \n",
        "\n",
        "[5 rows x 347 columns]"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "heading",
     "level": 4,
     "metadata": {},
     "source": [
      "1.2. Create the training and testing sets"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#20% of each user data for testing\n",
      "def assign_to_set(df, test_fraction=0.2):\n",
      "    '''Randomly flag a fraction of the rows of a dataframe as testing data\n",
      "    Input: dataframe, (float) fraction of rows to flag, rounded up to a whole number of rows\n",
      "    Output: copy of the dataframe with the for_testing column set to True on the sampled rows'''\n",
      "    #the seed is reset on every call, so calling again with the same dataframe picks the same rows\n",
      "    np.random.seed(1)\n",
      "    n_test = int(np.ceil(df.index.size * test_fraction))\n",
      "    sampled_ids = np.random.choice(df.index, size=n_test, replace=False)\n",
      "    df = df.copy() #work on a copy so the dataframe received from groupby.apply is not modified\n",
      "    df.loc[sampled_ids, 'for_testing'] = True #sampled_ids are index labels, so use label based indexing\n",
      "    return df\n",
      "\n",
      "fashion['for_testing'] = False\n",
      "grouped = fashion.groupby('user_id', group_keys=False).apply(assign_to_set)\n",
      "#align the flags with the rows of fashion by index label before using them as a row mask\n",
      "for_testing = grouped.for_testing.reindex(fashion.index)\n",
      "fashion_train = fashion[for_testing == False]\n",
      "fashion_test = fashion[for_testing == True]\n",
      "print(fashion.shape)\n",
      "print(fashion_train.shape)\n",
      "print(fashion_test.shape)\n",
      "#ensure we don't have the same rows on both sets and that no row has been left out of both\n",
      "assert len(fashion_train.index.intersection(fashion_test.index)) == 0\n",
      "assert len(fashion_train) + len(fashion_test) == len(fashion)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "(2245, 348)\n",
        "(1783, 348)\n",
        "(462, 348)\n"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "2. Estimation functions"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#using RMSE as performance criterion\n",
      "def compute_rmse(y_pred, y_true):\n",
      "    '''Calculate the root mean square error between the predicted and true ratings\n",
      "    Input: predicted ratings, true ratings (numpy arrays, lists or scalars)\n",
      "    Output: (float) RMSE'''\n",
      "    #convert to float arrays so plain lists are accepted and integer ratings are subtracted as floats\n",
      "    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)\n",
      "    return np.sqrt(np.mean(np.square(errors)))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def evaluate(estimate_f):\n",
      "    '''Score a recommendation function against the ratings of the testing set\n",
      "    Input: function that predicts the rating for a (user_id, pin_id) pair\n",
      "    Output: (float) RMSE between the predicted and the real ratings'''\n",
      "    predictions = []\n",
      "    #walk the testing set row by row and ask the estimate function for each (user, pin) pair\n",
      "    for user, pin in zip(fashion_test.user_id, fashion_test.pin_id):\n",
      "        predictions.append(estimate_f(user, pin))\n",
      "    true_ratings = fashion_test.rating.values\n",
      "    return compute_rmse(np.array(predictions), true_ratings)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 13
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "3. Content based recommendation engine"
     ]
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "3.1. Mean items rating by user"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def estimate1(user_id, pin_id):\n",
      "    '''mean of user ratings\n",
      "    Input: user id, pin id (not used by this estimate)\n",
      "    Output: mean of the ratings given by the user on the training set, 1 if the user has none'''\n",
      "    user_ratings = fashion_train.loc[fashion_train.user_id == user_id, 'rating']\n",
      "\n",
      "    if user_ratings.empty:\n",
      "        return 1 #the mean of no ratings is NaN and would turn the whole RMSE into NaN\n",
      "    else:\n",
      "        return user_ratings.mean()\n",
      "\n",
      "print('RMSE for estimate1: %s' % evaluate(estimate1))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "RMSE for estimate1: 0.834525165107\n"
       ]
      }
     ],
     "prompt_number": 14
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "3.2. Mean items rating grouped by blogger"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "#items indexed by pin_id, used to look up the blogger and the styles of a pin\n",
      "items_info = items.set_index('pin_id')\n",
      "#mean rating on the training set for every (pin, blogger) pair: pins as rows, bloggers as columns\n",
      "#arguments are passed by position (values, rows, columns) because the rows/cols keywords were renamed to index/columns\n",
      "means_by_blogger = fashion_train.pivot_table('rating', 'pin_id', 'blog_name')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def estimate2(user_id, pin_id):\n",
      "    '''mean rating of same blogger\n",
      "    Input: user id (not used by this estimate), pin id\n",
      "    Output: mean of the means_by_blogger column of the pin blogger, 1 if the blogger has no column'''\n",
      "    pin_blogger = items_info.loc[pin_id, 'blog_name'] #label based lookup, pin_id is the index of items_info\n",
      "\n",
      "    if pin_blogger in means_by_blogger.columns:\n",
      "        return means_by_blogger.loc[:, pin_blogger].mean() #mean value for that blogger\n",
      "    else:\n",
      "        return 1\n",
      "\n",
      "print('RMSE for estimate2: %s' % evaluate(estimate2))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "RMSE for estimate2: 0.81514703353\n"
       ]
      }
     ],
     "prompt_number": 16
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "3.3. Mean items rating by item style"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def style_cond(pin_id):\n",
      "    '''items with the same styles\n",
      "    Input: pin id\n",
      "    Output: boolean Series over the rows of fashion_train, True where the rated item has every style of the pin'''\n",
      "    pin_styles = items_info.loc[pin_id, 'blogger_style'] #label based lookup, pin_id is the index of items_info\n",
      "    #start from an all True Series (not a plain True) so a pin without styles still gives a usable row mask\n",
      "    same_style_cond = pd.Series(True, index=fashion_train.index)\n",
      "    for style in pin_styles:\n",
      "        same_style_cond = same_style_cond & (fashion_train['blogger_style_' + style] == 1)\n",
      "    return same_style_cond"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def estimate3(user_id, pin_id):\n",
      "    '''mean rating of the training rows selected by style_cond for the pin, 1 when there are none'''\n",
      "    same_style_rows = fashion_train.loc[style_cond(pin_id)]\n",
      "    if not same_style_rows.empty:\n",
      "        return same_style_rows.rating.mean()\n",
      "    return 1\n",
      "\n",
      "print('RMSE for estimate3: %s' % evaluate(estimate3))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "RMSE for estimate3: 0.825515788148\n"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "3.4. Mean items rating by item style and user"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def estimate4(user_id, pin_id):\n",
      "    '''mean of the user ratings on the training rows selected by style_cond for the pin, 1 when there are none'''\n",
      "    rated_by_user = fashion_train.user_id == user_id\n",
      "    matching_rows = fashion_train.loc[rated_by_user & style_cond(pin_id)]\n",
      "    return 1 if matching_rows.empty else matching_rows.rating.mean()\n",
      "\n",
      "print('RMSE for estimate4: %s' % evaluate(estimate4))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "RMSE for estimate4: 0.840272347162\n"
       ]
      }
     ],
     "prompt_number": 20
    },
    {
     "cell_type": "heading",
     "level": 3,
     "metadata": {},
     "source": [
      "3.5. Mean items rating by blogger and user"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def estimate5(user_id, pin_id):\n",
      "    '''mean rating of same blogger by the user\n",
      "    Input: user id, pin id\n",
      "    Output: mean of the training ratings the user gave to pins of the same blogger, 1 if there are none'''\n",
      "    user_condition = fashion_train.user_id == user_id\n",
      "    pin_blogger = items_info.loc[pin_id, 'blog_name'] #label based lookup, pin_id is the index of items_info\n",
      "    blogger_condition = fashion_train.blog_name == pin_blogger\n",
      "    ratings_by_user_bloggers = fashion_train.loc[user_condition & blogger_condition]\n",
      "\n",
      "    if ratings_by_user_bloggers.empty:\n",
      "        return 1\n",
      "    else:\n",
      "        return ratings_by_user_bloggers.rating.mean() #mean value for that blogger from the specific user\n",
      "\n",
      "print('RMSE for estimate5: %s' % evaluate(estimate5))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "RMSE for estimate5: 0.813904952702\n"
       ]
      }
     ],
     "prompt_number": 23
    }
   ],
   "metadata": {}
  }
 ]
}