{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Building a fashion recommender (III): Content based recommender" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "__The explanation of this implementation can be found at: http://www.rosariomgomez.me/__
\n", "\n", "__Index__
\n", "1. [Build the training and testing sets](#1.-Build-the-training-and-testing-sets)
\n", "2. [Estimation functions](#2.-Estimation-functions)
\n", "3. [Content based recommendations](#3.-Content-based-recommendation-engine)
" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "1. Build the training and testing sets" ] }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "1.1. Build the dataframes" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "import pandas as pd\n", "from create_features import create_pin_features, create_user_features" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "def get_db(host='server', port=27017, database='database_name', user='user', password='pwd'):\n", "    '''Open an authenticated connection to the MongoDB database\n", "    Input: host, port (27017 is the MongoDB default), database name, user and password;\n", "           the defaults are placeholders, pass the real values as arguments\n", "    Output: authenticated pymongo database handle'''\n", "    from pymongo import MongoClient\n", "    client = MongoClient(host, port)\n", "    db = client[database] #item access works for any database name\n", "    db.authenticate(user, password)\n", "    return db" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "db = get_db()" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "#retrieve all the ratings from the DB\n", "rated_outfits = db.ratings.find()\n", "list_ratings = list(rated_outfits)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "#retrieve all the users from the DB and build the feature vectors\n", "all_users = db.user.find()\n", "list_users = [create_user_features(user) for user in all_users]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "#retrieve all items from the DB and build the feature vectors\n", "all_pins = db.fullpin.find()\n", "list_pins = [create_pin_features(pin) for pin in all_pins]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "#build the pandas items dataframe\n", "items = pd.DataFrame(list_pins)\n", "items = items.rename(columns = 
{'_id':'pin_id'}) #to be in line with the ratings names\n", "items.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pin_idblog_nameblogger_ageblogger_body_shape_appleblogger_body_shape_hourglassblogger_body_shape_inverted_triangleblogger_body_shape_pearblogger_body_shape_rectangleblogger_dress_sizeblogger_styleblogger_style_bohemian chicblogger_style_casual chicblogger_style_classicblogger_style_edgyblogger_style_preppyblogger_style_romanticbrands_ASOSbrands_Abercrombie & Fitchbrands_Accessorizebrands_Alexander McQueen
0 537934c861e01f10f1118dea Hallie Daily 40 0 1 0 0 0 6 [classic, romantic] 0 0 1 0 0 1 0 0 0 0...
1 537934d261e01f10f1118e1b Hallie Daily 40 0 1 0 0 0 6 [classic] 0 0 1 0 0 0 0 0 0 0...
2 537934cc61e01f10f1118e00 Hallie Daily 40 0 1 0 0 0 6 [classic] 0 0 1 0 0 0 0 0 0 0...
3 537934c661e01f10f1118ddf Hallie Daily 40 0 1 0 0 0 6 [classic, romantic] 0 0 1 0 0 1 0 0 0 0...
4 537934c661e01f10f1118ddd Hallie Daily 40 0 1 0 0 0 6 [classic] 0 0 1 0 0 0 0 0 0 0...
\n", "

5 rows \u00d7 319 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " pin_id blog_name blogger_age \\\n", "0 537934c861e01f10f1118dea Hallie Daily 40 \n", "1 537934d261e01f10f1118e1b Hallie Daily 40 \n", "2 537934cc61e01f10f1118e00 Hallie Daily 40 \n", "3 537934c661e01f10f1118ddf Hallie Daily 40 \n", "4 537934c661e01f10f1118ddd Hallie Daily 40 \n", "\n", " blogger_body_shape_apple blogger_body_shape_hourglass \\\n", "0 0 1 \n", "1 0 1 \n", "2 0 1 \n", "3 0 1 \n", "4 0 1 \n", "\n", " blogger_body_shape_inverted_triangle blogger_body_shape_pear \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " blogger_body_shape_rectangle blogger_dress_size blogger_style \\\n", "0 0 6 [classic, romantic] \n", "1 0 6 [classic] \n", "2 0 6 [classic] \n", "3 0 6 [classic, romantic] \n", "4 0 6 [classic] \n", "\n", " blogger_style_bohemian chic blogger_style_casual chic \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " blogger_style_classic blogger_style_edgy blogger_style_preppy \\\n", "0 1 0 0 \n", "1 1 0 0 \n", "2 1 0 0 \n", "3 1 0 0 \n", "4 1 0 0 \n", "\n", " blogger_style_romantic brands_ASOS brands_Abercrombie & Fitch \\\n", "0 1 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 1 0 0 \n", "4 0 0 0 \n", "\n", " brands_Accessorize brands_Alexander McQueen \n", "0 0 0 ... \n", "1 0 0 ... \n", "2 0 0 ... \n", "3 0 0 ... \n", "4 0 0 ... \n", "\n", "[5 rows x 319 columns]" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "#build the ratings dataframe\n", "cols = ['user_id', 'pin_id', 'rating']\n", "ratings = pd.DataFrame(list_ratings, columns=cols)\n", "ratings.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idpin_idrating
0 538677f561e01f0be9e838f7 537933bb61e01f10f111886f 0
1 538677f561e01f0be9e838f7 537933a161e01f10f11187e6 0
2 538677f561e01f0be9e838f7 53793d6d61e01f10f111a725 2
3 538677f561e01f0be9e838f7 537933be61e01f10f111887f 2
4 538677f561e01f0be9e838f7 53793d8861e01f10f111a741 0
\n", "

5 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " user_id pin_id rating\n", "0 538677f561e01f0be9e838f7 537933bb61e01f10f111886f 0\n", "1 538677f561e01f0be9e838f7 537933a161e01f10f11187e6 0\n", "2 538677f561e01f0be9e838f7 53793d6d61e01f10f111a725 2\n", "3 538677f561e01f0be9e838f7 537933be61e01f10f111887f 2\n", "4 538677f561e01f0be9e838f7 53793d8861e01f10f111a741 0\n", "\n", "[5 rows x 3 columns]" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "#build the pandas users dataframe\n", "users = pd.DataFrame(list_users)\n", "users = users.rename(columns = {'_id':'user_id'}) #to be in line with the ratings names\n", "users.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idagecountryday_offdress_sizefashionistalike_styles_preflike_styles_pref_bohemian chiclike_styles_pref_casual chiclike_styles_pref_classiclike_styles_pref_edgylike_styles_pref_preppylike_styles_pref_romanticnolike_styles_prefnolike_styles_pref_bohemian chicnolike_styles_pref_casual chicnolike_styles_pref_classicnolike_styles_pref_edgynolike_styles_pref_preppynolike_styles_pref_romantic
0 53962dfa3191490008a690df 55 US sport 10 nolike [classic, casual chic, preppy] 0 1 1 0 1 0 [] 0 0 0 0 0 0...
1 53968e993d4e0c0007a2546f 50 US family 6 nolike [] 0 0 0 0 0 0 [edgy] 0 0 0 1 0 0...
2 53971683b7d85a0008b1bbe2 30 ES family 8 nolike [romantic, casual chic] 0 1 0 0 0 1 [edgy] 0 0 0 1 0 0...
3 539851a17f6ba70007ba8bdb 30 ES party 10 nolike [romantic, casual chic, preppy] 0 1 0 0 1 1 [edgy] 0 0 0 1 0 0...
4 539770c4a9a4570008c28a9b 45 ES family 10 ok [classic, casual chic, preppy] 0 1 1 0 1 0 [bohemian chic, edgy] 1 0 0 1 0 0...
\n", "

5 rows \u00d7 27 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ " user_id age country day_off dress_size fashionista \\\n", "0 53962dfa3191490008a690df 55 US sport 10 nolike \n", "1 53968e993d4e0c0007a2546f 50 US family 6 nolike \n", "2 53971683b7d85a0008b1bbe2 30 ES family 8 nolike \n", "3 539851a17f6ba70007ba8bdb 30 ES party 10 nolike \n", "4 539770c4a9a4570008c28a9b 45 ES family 10 ok \n", "\n", " like_styles_pref like_styles_pref_bohemian chic \\\n", "0 [classic, casual chic, preppy] 0 \n", "1 [] 0 \n", "2 [romantic, casual chic] 0 \n", "3 [romantic, casual chic, preppy] 0 \n", "4 [classic, casual chic, preppy] 0 \n", "\n", " like_styles_pref_casual chic like_styles_pref_classic \\\n", "0 1 1 \n", "1 0 0 \n", "2 1 0 \n", "3 1 0 \n", "4 1 1 \n", "\n", " like_styles_pref_edgy like_styles_pref_preppy like_styles_pref_romantic \\\n", "0 0 1 0 \n", "1 0 0 0 \n", "2 0 0 1 \n", "3 0 1 1 \n", "4 0 1 0 \n", "\n", " nolike_styles_pref nolike_styles_pref_bohemian chic \\\n", "0 [] 0 \n", "1 [edgy] 0 \n", "2 [edgy] 0 \n", "3 [edgy] 0 \n", "4 [bohemian chic, edgy] 1 \n", "\n", " nolike_styles_pref_casual chic nolike_styles_pref_classic \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", " nolike_styles_pref_edgy nolike_styles_pref_preppy \\\n", "0 0 0 \n", "1 1 0 \n", "2 1 0 \n", "3 1 0 \n", "4 1 0 \n", "\n", " nolike_styles_pref_romantic \n", "0 0 ... \n", "1 0 ... \n", "2 0 ... \n", "3 0 ... \n", "4 0 ... \n", "\n", "[5 rows x 27 columns]" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "#merge ratings, items and users dataframes\n", "fashion = pd.merge(pd.merge(ratings, users), items)\n", "fashion.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
user_idpin_idratingagecountryday_offdress_sizefashionistalike_styles_preflike_styles_pref_bohemian chiclike_styles_pref_casual chiclike_styles_pref_classiclike_styles_pref_edgylike_styles_pref_preppylike_styles_pref_romanticnolike_styles_prefnolike_styles_pref_bohemian chicnolike_styles_pref_casual chicnolike_styles_pref_classicnolike_styles_pref_edgy
0 538677f561e01f0be9e838f7 537933bb61e01f10f111886f 0 30 US sport 8 love [classic, casual chic, preppy] 0 1 1 0 1 0 [edgy] 0 0 0 1...
1 539604e40aa8e20007a976fb 537933bb61e01f10f111886f 2 30 ES read 8 ok [casual chic] 0 1 0 0 0 0 [] 0 0 0 0...
2 539616880aa8e20008b99e19 537933bb61e01f10f111886f 2 25 ES family 6 ok [casual chic, preppy] 0 1 0 0 1 0 [edgy] 0 0 0 1...
3 539628df3191490008a690d5 537933bb61e01f10f111886f 2 40 US sport 10 ok [classic] 0 0 1 0 0 0 [edgy] 0 0 0 1...
4 5396d3a6989960000db79381 537933bb61e01f10f111886f 1 30 ES party 10 nolike [classic, casual chic] 0 1 1 0 0 0 [bohemian chic, edgy] 1 0 0 1...
\n", "

5 rows \u00d7 347 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ " user_id pin_id rating age country \\\n", "0 538677f561e01f0be9e838f7 537933bb61e01f10f111886f 0 30 US \n", "1 539604e40aa8e20007a976fb 537933bb61e01f10f111886f 2 30 ES \n", "2 539616880aa8e20008b99e19 537933bb61e01f10f111886f 2 25 ES \n", "3 539628df3191490008a690d5 537933bb61e01f10f111886f 2 40 US \n", "4 5396d3a6989960000db79381 537933bb61e01f10f111886f 1 30 ES \n", "\n", " day_off dress_size fashionista like_styles_pref \\\n", "0 sport 8 love [classic, casual chic, preppy] \n", "1 read 8 ok [casual chic] \n", "2 family 6 ok [casual chic, preppy] \n", "3 sport 10 ok [classic] \n", "4 party 10 nolike [classic, casual chic] \n", "\n", " like_styles_pref_bohemian chic like_styles_pref_casual chic \\\n", "0 0 1 \n", "1 0 1 \n", "2 0 1 \n", "3 0 0 \n", "4 0 1 \n", "\n", " like_styles_pref_classic like_styles_pref_edgy like_styles_pref_preppy \\\n", "0 1 0 1 \n", "1 0 0 0 \n", "2 0 0 1 \n", "3 1 0 0 \n", "4 1 0 0 \n", "\n", " like_styles_pref_romantic nolike_styles_pref \\\n", "0 0 [edgy] \n", "1 0 [] \n", "2 0 [edgy] \n", "3 0 [edgy] \n", "4 0 [bohemian chic, edgy] \n", "\n", " nolike_styles_pref_bohemian chic nolike_styles_pref_casual chic \\\n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 1 0 \n", "\n", " nolike_styles_pref_classic nolike_styles_pref_edgy \n", "0 0 1 ... \n", "1 0 0 ... \n", "2 0 1 ... \n", "3 0 1 ... \n", "4 0 1 ... \n", "\n", "[5 rows x 347 columns]" ] } ], "prompt_number": 10 }, { "cell_type": "heading", "level": 4, "metadata": {}, "source": [ "1.2. 
Create the training and testing sets" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#20% of each user data for testing\n", "def assign_to_set(df, test_fraction=0.2, seed=1):\n", "    '''Randomly flag a fraction of the dataframe rows (20% by default, rounded up) for testing\n", "    Input: dataframe with a for_testing column, fraction of rows to flag, random seed\n", "    Output: copy of the dataframe with for_testing set to True on the sampled rows'''\n", "    np.random.seed(seed) #seeded on every call so each group is sampled reproducibly\n", "    n_test = int(np.ceil(df.index.size * test_fraction))\n", "    sampled_ids = np.random.choice(df.index, size=n_test, replace=False)\n", "    df = df.copy() #flag a copy instead of writing into the caller's frame\n", "    df.loc[sampled_ids, 'for_testing'] = True #.loc replaces the deprecated .ix indexer\n", "    return df\n", "\n", "fashion['for_testing'] = False\n", "grouped = fashion.groupby('user_id', group_keys=False).apply(assign_to_set)\n", "is_test = grouped.for_testing.reindex(fashion.index).astype(bool) #align the flags with the rows of fashion\n", "fashion_train = fashion[~is_test]\n", "fashion_test = fashion[is_test]\n", "print(fashion.shape)\n", "print(fashion_train.shape)\n", "print(fashion_test.shape)\n", "assert len(fashion_train.index.intersection(fashion_test.index)) == 0 #ensure we don't have the same values on both sets\n", "assert len(fashion_train) + len(fashion_test) == len(fashion) #every rating belongs to exactly one set" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "(2245, 348)\n", "(1783, 348)\n", "(462, 348)\n" ] } ], "prompt_number": 11 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "2. 
Estimation functions" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#using RMSE as performance criterion\n", "def compute_rmse(y_pred, y_true):\n", "    '''Calculate the root mean square error between the predicted and true ratings\n", "    Input: predicted ratings, true ratings (arrays or sequences of the same length)\n", "    Output: (float) RMSE'''\n", "    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)\n", "    return np.sqrt(np.mean(np.power(errors, 2)))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "def evaluate(estimate_f):\n", "    '''Calculate the RMSE of a rating estimation function over the testing set\n", "    Input: function that predicts the rating of a (user_id, pin_id) pair\n", "    Output: (float) RMSE'''\n", "    ids_to_estimate = zip(fashion_test.user_id, fashion_test.pin_id) #(user_id, pin_id) pairs of the testing set\n", "    estimated = np.array([estimate_f(u, p) for u, p in ids_to_estimate]) #one prediction per testing pair\n", "    real = fashion_test.rating.values\n", "    return compute_rmse(estimated, real)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 13 }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "3. Content based recommendation engine" ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "3.1. Mean items rating by user" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def estimate1(user_id, pin_id):\n", "    '''mean of user ratings (1 when the user has no rating in the training set)'''\n", "    user_ratings = fashion_train.loc[fashion_train.user_id == user_id, 'rating']\n", "    if user_ratings.empty:\n", "        return 1 #the mean of nothing is NaN, which would turn the whole RMSE into NaN\n", "    return user_ratings.mean()\n", "\n", "print('RMSE for estimate1: %s' % evaluate(estimate1))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RMSE for estimate1: 0.834525165107\n" ] } ], "prompt_number": 14 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "3.2. 
Mean items rating grouped by blogger" ] }, { "cell_type": "code", "collapsed": false, "input": [ "#items table indexed by pin_id, used to look up the blogger and the styles of a pin\n", "items_info = items.set_index('pin_id')\n", "#mean training rating of every pin (rows) under its blogger (columns); same table as the former\n", "#pivot_table call, built with groupby/unstack because its rows=/cols= keywords were removed from pandas\n", "means_by_blogger = fashion_train.groupby(['pin_id', 'blog_name'])['rating'].mean().unstack('blog_name')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "def estimate2(user_id, pin_id):\n", "    '''mean rating of same blogger (1 when the blogger has no rated pin in the training set)'''\n", "    pin_blogger = items_info.loc[pin_id, 'blog_name']\n", "\n", "    if pin_blogger not in means_by_blogger.columns:\n", "        return 1\n", "    return means_by_blogger[pin_blogger].mean() #average of the per-pin means of that blogger\n", "\n", "print('RMSE for estimate2: %s' % evaluate(estimate2))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RMSE for estimate2: 0.81514703353\n" ] } ], "prompt_number": 16 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "3.3. 
Mean items rating by item style" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def style_cond(pin_id):\n", "    '''boolean mask over fashion_train selecting the rows whose item has every style of the given pin\n", "    (all True when the pin has no style, so the mask can always be passed to .loc)'''\n", "    pin_styles = items_info.loc[pin_id, 'blogger_style']\n", "    same_style_cond = pd.Series(True, index=fashion_train.index) #a Series, never the bare scalar True\n", "    for style in pin_styles:\n", "        same_style_cond = same_style_cond & (fashion_train['blogger_style_' + style] == 1)\n", "    return same_style_cond" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "def estimate3(user_id, pin_id):\n", "    '''mean rating of same pin style (1 when no such item was rated in the training set)'''\n", "    ratings_by_styles = fashion_train.loc[style_cond(pin_id)]\n", "\n", "    if ratings_by_styles.empty:\n", "        return 1\n", "    return ratings_by_styles.rating.mean()\n", "\n", "print('RMSE for estimate3: %s' % evaluate(estimate3))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RMSE for estimate3: 0.825515788148\n" ] } ], "prompt_number": 18 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "3.4. Mean items rating by item style and user" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def estimate4(user_id, pin_id):\n", "    '''mean of the items with the same style rated by the user (1 when the user rated none)'''\n", "    user_condition = fashion_train.user_id == user_id\n", "    ratings_by_user_styles = fashion_train.loc[user_condition & style_cond(pin_id)]\n", "\n", "    if ratings_by_user_styles.empty:\n", "        return 1\n", "    return ratings_by_user_styles.rating.mean()\n", "\n", "print('RMSE for estimate4: %s' % evaluate(estimate4))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RMSE for estimate4: 0.840272347162\n" ] } ], "prompt_number": 20 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "3.5. 
Mean items rating by blogger and user" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def estimate5(user_id, pin_id):\n", "    '''mean rating of same blogger by the user (1 when the user rated no item of that blogger)'''\n", "    pin_blogger = items_info.loc[pin_id, 'blog_name'] #.loc replaces the deprecated .ix indexer\n", "    user_condition = fashion_train.user_id == user_id\n", "    pin_condition = fashion_train.blog_name == pin_blogger\n", "    ratings_by_user_bloggers = fashion_train.loc[user_condition & pin_condition]\n", "\n", "    if ratings_by_user_bloggers.empty:\n", "        return 1\n", "    return ratings_by_user_bloggers.rating.mean() #mean value for that blogger from the specific user\n", "\n", "print('RMSE for estimate5: %s' % evaluate(estimate5))" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "RMSE for estimate5: 0.813904952702\n" ] } ], "prompt_number": 23 } ], "metadata": {} } ] }