# %% [markdown]
# # CS579: Lecture 19
#
# **Recommendation Systems**
#
# *[Dr. Aron Culotta](http://cs.iit.edu/~culotta)*
# *[Illinois Institute of Technology](http://iit.edu)*

# %% [markdown]
# ## Recommendation Systems, continued.

# %% [markdown]
# Let's try out some of the ideas from last lecture on the [MovieLens](http://grouplens.org/datasets/movielens/) dataset.

# %%
import os
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded
import zipfile

import matplotlib.pyplot as plt
import numpy as np  # the rating-vector and correlation cells below refer to `np`
import pandas as pd

# Plain-Python spelling of the `%matplotlib inline` line magic.
get_ipython().run_line_magic('matplotlib', 'inline')

# %%
DATA_URL = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
DATA_ARCHIVE = 'ml-latest-small.zip'
DATA_DIR = 'ml-latest-small'
DATA_FILES = ('ratings.csv', 'movies.csv', 'tags.csv')


def download_data(url=DATA_URL, archive=DATA_ARCHIVE, data_dir=DATA_DIR):
    """Download and unzip the MovieLens archive unless it is already present.

    Params:
      url........Location of the zip archive.
      archive....Local file name the archive is saved under.
      data_dir...Directory that is expected to hold the CSV files once the
                 archive has been extracted into the working directory.
    Returns:
      None. Files are written to the current working directory.
    """
    # Skip the network round-trip when every CSV read below already exists,
    # so re-running the notebook does not re-download the archive.
    if all(os.path.exists(os.path.join(data_dir, name)) for name in DATA_FILES):
        return
    urllib.request.urlretrieve(url, archive)
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(archive) as zfile:
        zfile.extractall()


# %%
download_data()
path = DATA_DIR
ratings = pd.read_csv(os.path.join(path, 'ratings.csv'))
movies = pd.read_csv(os.path.join(path, 'movies.csv'))
tags = pd.read_csv(os.path.join(path, 'tags.csv'))
# %%
# Peek at the first three rows of each table to see its columns.
ratings.iloc[:3]

# %%
movies.iloc[:3]

# %%
tags.iloc[:3]
# %%
# Every rating given by the user whose id in the raw file is 1.
# NOTE: this must run before the re-indexing cell below; afterwards that
# same user is stored under id 0.
ratings[ratings.userId==1]

# %% [markdown]
# **Let's use the item-item method to predict user 1's rating for movie 3671**

# %%
movies[movies.movieId==3671].iloc[0]['genres']

# %%
# Smallest user ids in the raw data (they start at 1).
user_ids = sorted(set(ratings.userId))
user_ids[:10]

# %%
# Make user ids start at 0 so a user id can be used directly as an array index.
# Subtracting the current minimum (rather than a literal 1) keeps this cell
# idempotent: on the first run it shifts 1 -> 0, and on any re-run the minimum
# is already 0, so nothing changes. With a literal 1 a second run would yield
# id -1, which numpy would silently treat as "last element".
ratings['userId'] = ratings['userId'] - ratings['userId'].min()

# %%
# Same listing after re-indexing: ids now start at 0.
user_ids = sorted(set(ratings.userId))
user_ids[:10]
# %%
# What are the ratings for 3671?
ratings[ratings.movieId==3671]

# %%
# Get the ratings from all users assigned to movie 3671
# Store ratings in a numpy array with dimension equal to number of users;
# position u holds user u's rating, 0 meaning "no rating".
target_movie_id = 3671
target_user_id = 0  # the user we are predicting for (id 1 in the raw file)
target_rows = ratings[ratings.movieId == target_movie_id]
target_movie_vector = np.zeros(len(user_ids))
# Index with the integer userId column itself. Going through iterrows() turns
# each row into a float Series (because `rating` is float), and indexing an
# array with a float is what produced the VisibleDeprecationWarning recorded
# for this cell; newer numpy releases reject it outright.
target_movie_vector[target_rows.userId.values] = target_rows.rating.values
# Remove target user's rating for this movie:
target_movie_vector[target_user_id] = 0
# Show only the head of the vector; the full array is mostly zeros.
target_movie_vector[:30]
# %%
from scipy.stats import pearsonr


def correlation(v1, v2, verbose=False):
    """Pearson correlation between two item vectors (e.g., all ratings given to movie j).

    Only positions where both vectors are non-zero are compared; a zero
    entry is treated as "no rating".

    Params:
      v1, v2....Equal-length 1-d sequences of ratings, one position per user.
      verbose...If True, print the two co-rated sub-vectors before computing
                the correlation. Off by default so that calling this once per
                movie in a loop does not flood the cell output.
    Returns:
      The Pearson correlation coefficient as a float, or 0.0 when it is
      undefined: fewer than two co-rated positions, or one of the co-rated
      sub-vectors is constant (pearsonr would return nan, which breaks
      sorting and weighted averages downstream).
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    co_rated = (v1 != 0) & (v2 != 0)
    r1, r2 = v1[co_rated], v2[co_rated]
    if verbose:
        print(r1)
        print(r2)
    if len(r1) < 2:
        return 0.0
    # Zero variance on either side makes the coefficient undefined.
    if r1.min() == r1.max() or r2.min() == r2.max():
        return 0.0
    return float(pearsonr(r1, r2)[0])


# %%
correlation(np.array([0,4,0,5,0,5,2]),
            np.array([4,3,0,4,0,5,1]), verbose=True)

# %%
correlation(np.array([0,4,0,5,0,5,2]),
            np.array([4,5,0,4,0,4,5]), verbose=True)  # change second vector
# %%
# For every other movie that user 1 (stored under id 0) rated, compute its
# correlation with movie 3671
target_movie_id = 3671
target_user_id = 0
n_users = len(user_ids)
correlations = []  # (correlation, movieId) tuples
# Walk the integer movieId column rather than iterrows(): iterrows() converts
# every row to floats, which turned movie ids into 2294.0 and made the array
# index below a float (the VisibleDeprecationWarning recorded for this cell).
rated_movie_ids = ratings.loc[ratings.userId == target_user_id, 'movieId']
for movie_id in rated_movie_ids:
    movie_id = int(movie_id)
    if movie_id == target_movie_id:  # ignore Blazing Saddles
        continue
    # get all user ratings for this title.
    movie_rows = ratings[ratings.movieId == movie_id]
    movie_vector = np.zeros(n_users)
    movie_vector[movie_rows.userId.values] = movie_rows.rating.values
    corr = correlation(target_movie_vector, movie_vector)
    correlations.append((corr, movie_id))

# Ten most strongly correlated movies, best first.
sorted(correlations, reverse=True)[:10]
# %%
movies[movies.movieId==1287]

# %%
# Now, take top K movies and do weighted average to compute predicted score.
K = 5
target_user_id = 0
top_movies = sorted(correlations, reverse=True)[:K]
top_movie_ids = [int(movie_id) for _, movie_id in top_movies]
top_movie_corrs = [corr for corr, _ in top_movies]
# get target user's ratings for these movies.
# Both conditions are combined into one mask over `ratings`; filtering twice
# in a chain applies a full-length mask to an already filtered frame, which
# is what raised "Boolean Series key will be reindexed" in this cell.
top_ratings = [
    ratings.loc[(ratings.userId == target_user_id) & (ratings.movieId == tmid),
                'rating'].iloc[0]
    for tmid in top_movie_ids
]
top_ratings

# %%
# weighted average: each rating is weighted by that movie's correlation with
# the target movie.
weight_total = sum(top_movie_corrs)
if weight_total != 0:
    predicted_rating = np.dot(np.array(top_ratings), np.array(top_movie_corrs)) / weight_total
else:
    # All weights are zero or cancel out, so the weighted form would divide
    # by zero; fall back to the unweighted mean of the neighbours' ratings.
    predicted_rating = float(np.mean(top_ratings))
predicted_rating

# True rating: 3.0