{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CS579: Lecture 19 \n",
"\n",
"**Recommendation Systems**\n",
"\n",
"*[Dr. Aron Culotta](http://cs.iit.edu/~culotta)* \n",
"*[Illinois Institute of Technology](http://iit.edu)*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Recommendation Systems, continued."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try out some of the ideas from last lecture on the [MovieLens](http://grouplens.org/datasets/movielens/) dataset."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import os\n",
"import pandas as pd\n",
"import urllib\n",
"import zipfile\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# Download the data.\n",
"def download_data():\n",
" \"\"\" Download and unzip data.\n",
" DONE ALREADY.\n",
" \"\"\"\n",
" url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'\n",
" urllib.request.urlretrieve(url, 'ml-latest-small.zip')\n",
" zfile = zipfile.ZipFile('ml-latest-small.zip')\n",
" zfile.extractall()\n",
" zfile.close()\n",
" \n",
"download_data()\n",
"path = 'ml-latest-small'\n",
"ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')\n",
"movies = pd.read_csv(path + os.path.sep + 'movies.csv')\n",
"tags = pd.read_csv(path + os.path.sep + 'tags.csv')"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 31 | \n",
" 2.5 | \n",
" 1260759144 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1029 | \n",
" 3.0 | \n",
" 1260759179 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 1061 | \n",
" 3.0 | \n",
" 1260759182 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 31 2.5 1260759144\n",
"1 1 1029 3.0 1260759179\n",
"2 1 1061 3.0 1260759182"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children|Fantasy | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story (1995) \n",
"1 2 Jumanji (1995) \n",
"2 3 Grumpier Old Men (1995) \n",
"\n",
" genres \n",
"0 Adventure|Animation|Children|Comedy|Fantasy \n",
"1 Adventure|Children|Fantasy \n",
"2 Comedy|Romance "
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" tag | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 15 | \n",
" 339 | \n",
" sandra 'boring' bullock | \n",
" 1138537770 | \n",
"
\n",
" \n",
" | 1 | \n",
" 15 | \n",
" 1955 | \n",
" dentist | \n",
" 1193435061 | \n",
"
\n",
" \n",
" | 2 | \n",
" 15 | \n",
" 7478 | \n",
" Cambodia | \n",
" 1170560997 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userId movieId tag timestamp\n",
"0 15 339 sandra 'boring' bullock 1138537770\n",
"1 15 1955 dentist 1193435061\n",
"2 15 7478 Cambodia 1170560997"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tags.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 31 | \n",
" 2.5 | \n",
" 1260759144 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 1029 | \n",
" 3.0 | \n",
" 1260759179 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 1061 | \n",
" 3.0 | \n",
" 1260759182 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 1129 | \n",
" 2.0 | \n",
" 1260759185 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 1172 | \n",
" 4.0 | \n",
" 1260759205 | \n",
"
\n",
" \n",
" | 5 | \n",
" 1 | \n",
" 1263 | \n",
" 2.0 | \n",
" 1260759151 | \n",
"
\n",
" \n",
" | 6 | \n",
" 1 | \n",
" 1287 | \n",
" 2.0 | \n",
" 1260759187 | \n",
"
\n",
" \n",
" | 7 | \n",
" 1 | \n",
" 1293 | \n",
" 2.0 | \n",
" 1260759148 | \n",
"
\n",
" \n",
" | 8 | \n",
" 1 | \n",
" 1339 | \n",
" 3.5 | \n",
" 1260759125 | \n",
"
\n",
" \n",
" | 9 | \n",
" 1 | \n",
" 1343 | \n",
" 2.0 | \n",
" 1260759131 | \n",
"
\n",
" \n",
" | 10 | \n",
" 1 | \n",
" 1371 | \n",
" 2.5 | \n",
" 1260759135 | \n",
"
\n",
" \n",
" | 11 | \n",
" 1 | \n",
" 1405 | \n",
" 1.0 | \n",
" 1260759203 | \n",
"
\n",
" \n",
" | 12 | \n",
" 1 | \n",
" 1953 | \n",
" 4.0 | \n",
" 1260759191 | \n",
"
\n",
" \n",
" | 13 | \n",
" 1 | \n",
" 2105 | \n",
" 4.0 | \n",
" 1260759139 | \n",
"
\n",
" \n",
" | 14 | \n",
" 1 | \n",
" 2150 | \n",
" 3.0 | \n",
" 1260759194 | \n",
"
\n",
" \n",
" | 15 | \n",
" 1 | \n",
" 2193 | \n",
" 2.0 | \n",
" 1260759198 | \n",
"
\n",
" \n",
" | 16 | \n",
" 1 | \n",
" 2294 | \n",
" 2.0 | \n",
" 1260759108 | \n",
"
\n",
" \n",
" | 17 | \n",
" 1 | \n",
" 2455 | \n",
" 2.5 | \n",
" 1260759113 | \n",
"
\n",
" \n",
" | 18 | \n",
" 1 | \n",
" 2968 | \n",
" 1.0 | \n",
" 1260759200 | \n",
"
\n",
" \n",
" | 19 | \n",
" 1 | \n",
" 3671 | \n",
" 3.0 | \n",
" 1260759117 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp\n",
"0 1 31 2.5 1260759144\n",
"1 1 1029 3.0 1260759179\n",
"2 1 1061 3.0 1260759182\n",
"3 1 1129 2.0 1260759185\n",
"4 1 1172 4.0 1260759205\n",
"5 1 1263 2.0 1260759151\n",
"6 1 1287 2.0 1260759187\n",
"7 1 1293 2.0 1260759148\n",
"8 1 1339 3.5 1260759125\n",
"9 1 1343 2.0 1260759131\n",
"10 1 1371 2.5 1260759135\n",
"11 1 1405 1.0 1260759203\n",
"12 1 1953 4.0 1260759191\n",
"13 1 2105 4.0 1260759139\n",
"14 1 2150 3.0 1260759194\n",
"15 1 2193 2.0 1260759198\n",
"16 1 2294 2.0 1260759108\n",
"17 1 2455 2.5 1260759113\n",
"18 1 2968 1.0 1260759200\n",
"19 1 3671 3.0 1260759117"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ratings[ratings.userId==1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Let's use the item-item method to predict user 1's rating for movie 3671**"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" | 2925 | \n",
" 3671 | \n",
" Blazing Saddles (1974) | \n",
" Comedy|Western | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId title genres\n",
"2925 3671 Blazing Saddles (1974) Comedy|Western"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[movies.movieId==3671].iloc[0]['genres']"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many users are there?\n",
"user_ids = sorted(set(ratings.userId))\n",
"#len(user_ids)\n",
"user_ids[:10]"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# make user ids start at 0.\n",
"ratings['userId'] = ratings['userId'] - 1"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
]
},
"execution_count": 107,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many users are there?\n",
"user_ids = sorted(set(ratings.userId))\n",
"#len(user_ids)\n",
"user_ids[:10]"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" userId | \n",
" movieId | \n",
" rating | \n",
" timestamp | \n",
"
\n",
" \n",
" \n",
" \n",
" | 19 | \n",
" 0 | \n",
" 3671 | \n",
" 3.0 | \n",
" 1260759117 | \n",
"
\n",
" \n",
" | 1679 | \n",
" 14 | \n",
" 3671 | \n",
" 2.0 | \n",
" 1166586157 | \n",
"
\n",
" \n",
" | 4436 | \n",
" 22 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1149868554 | \n",
"
\n",
" \n",
" | 5761 | \n",
" 29 | \n",
" 3671 | \n",
" 4.0 | \n",
" 960918106 | \n",
"
\n",
" \n",
" | 8453 | \n",
" 55 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1467003357 | \n",
"
\n",
" \n",
" | 10809 | \n",
" 72 | \n",
" 3671 | \n",
" 3.0 | \n",
" 1255595938 | \n",
"
\n",
" \n",
" | 11981 | \n",
" 74 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1165596914 | \n",
"
\n",
" \n",
" | 12031 | \n",
" 75 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1194384277 | \n",
"
\n",
" \n",
" | 12239 | \n",
" 76 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1163079471 | \n",
"
\n",
" \n",
" | 13017 | \n",
" 82 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1156206112 | \n",
"
\n",
" \n",
" | 16005 | \n",
" 101 | \n",
" 3671 | \n",
" 4.0 | \n",
" 959975744 | \n",
"
\n",
" \n",
" | 18453 | \n",
" 119 | \n",
" 3671 | \n",
" 2.5 | \n",
" 1167422038 | \n",
"
\n",
" \n",
" | 18840 | \n",
" 124 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1269735510 | \n",
"
\n",
" \n",
" | 19659 | \n",
" 129 | \n",
" 3671 | \n",
" 2.0 | \n",
" 1149644335 | \n",
"
\n",
" \n",
" | 20255 | \n",
" 133 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1361244873 | \n",
"
\n",
" \n",
" | 21162 | \n",
" 147 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1059604766 | \n",
"
\n",
" \n",
" | 24820 | \n",
" 177 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1437427720 | \n",
"
\n",
" \n",
" | 26709 | \n",
" 194 | \n",
" 3671 | \n",
" 5.0 | \n",
" 976288624 | \n",
"
\n",
" \n",
" | 26848 | \n",
" 195 | \n",
" 3671 | \n",
" 5.0 | \n",
" 959223213 | \n",
"
\n",
" \n",
" | 28827 | \n",
" 211 | \n",
" 3671 | \n",
" 2.5 | \n",
" 1218954775 | \n",
"
\n",
" \n",
" | 31181 | \n",
" 221 | \n",
" 3671 | \n",
" 3.0 | \n",
" 960920305 | \n",
"
\n",
" \n",
" | 36982 | \n",
" 264 | \n",
" 3671 | \n",
" 5.0 | \n",
" 960060002 | \n",
"
\n",
" \n",
" | 39237 | \n",
" 284 | \n",
" 3671 | \n",
" 3.0 | \n",
" 965091993 | \n",
"
\n",
" \n",
" | 39772 | \n",
" 290 | \n",
" 3671 | \n",
" 5.0 | \n",
" 1111489095 | \n",
"
\n",
" \n",
" | 40676 | \n",
" 293 | \n",
" 3671 | \n",
" 3.0 | \n",
" 1062536588 | \n",
"
\n",
" \n",
" | 43028 | \n",
" 305 | \n",
" 3671 | \n",
" 4.0 | \n",
" 959197066 | \n",
"
\n",
" \n",
" | 43260 | \n",
" 308 | \n",
" 3671 | \n",
" 5.0 | \n",
" 1114567315 | \n",
"
\n",
" \n",
" | 44986 | \n",
" 314 | \n",
" 3671 | \n",
" 5.0 | \n",
" 1046663466 | \n",
"
\n",
" \n",
" | 47765 | \n",
" 351 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1420521398 | \n",
"
\n",
" \n",
" | 51493 | \n",
" 379 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1199154721 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 56105 | \n",
" 404 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1097698251 | \n",
"
\n",
" \n",
" | 58240 | \n",
" 422 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1355514296 | \n",
"
\n",
" \n",
" | 58423 | \n",
" 423 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1088826857 | \n",
"
\n",
" \n",
" | 59126 | \n",
" 427 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1304130933 | \n",
"
\n",
" \n",
" | 59401 | \n",
" 429 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1111489222 | \n",
"
\n",
" \n",
" | 60953 | \n",
" 441 | \n",
" 3671 | \n",
" 5.0 | \n",
" 1227968502 | \n",
"
\n",
" \n",
" | 62245 | \n",
" 451 | \n",
" 3671 | \n",
" 4.0 | \n",
" 976421517 | \n",
"
\n",
" \n",
" | 64006 | \n",
" 459 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1072837173 | \n",
"
\n",
" \n",
" | 66319 | \n",
" 467 | \n",
" 3671 | \n",
" 2.5 | \n",
" 1296192490 | \n",
"
\n",
" \n",
" | 67861 | \n",
" 471 | \n",
" 3671 | \n",
" 5.0 | \n",
" 958950037 | \n",
"
\n",
" \n",
" | 72067 | \n",
" 504 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1340406082 | \n",
"
\n",
" \n",
" | 72953 | \n",
" 508 | \n",
" 3671 | \n",
" 4.0 | \n",
" 978938266 | \n",
"
\n",
" \n",
" | 74831 | \n",
" 518 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1469927119 | \n",
"
\n",
" \n",
" | 76376 | \n",
" 528 | \n",
" 3671 | \n",
" 4.0 | \n",
" 959965342 | \n",
"
\n",
" \n",
" | 81283 | \n",
" 552 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1423010113 | \n",
"
\n",
" \n",
" | 82292 | \n",
" 560 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1172734423 | \n",
"
\n",
" \n",
" | 84549 | \n",
" 563 | \n",
" 3671 | \n",
" 5.0 | \n",
" 974712545 | \n",
"
\n",
" \n",
" | 85997 | \n",
" 574 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1012594537 | \n",
"
\n",
" \n",
" | 87014 | \n",
" 579 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1155617921 | \n",
"
\n",
" \n",
" | 88086 | \n",
" 584 | \n",
" 3671 | \n",
" 4.0 | \n",
" 975363032 | \n",
"
\n",
" \n",
" | 88440 | \n",
" 586 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1112034902 | \n",
"
\n",
" \n",
" | 90220 | \n",
" 597 | \n",
" 3671 | \n",
" 4.0 | \n",
" 1008571310 | \n",
"
\n",
" \n",
" | 90860 | \n",
" 603 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1277532575 | \n",
"
\n",
" \n",
" | 91255 | \n",
" 604 | \n",
" 3671 | \n",
" 4.0 | \n",
" 980175465 | \n",
"
\n",
" \n",
" | 92650 | \n",
" 614 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1468174876 | \n",
"
\n",
" \n",
" | 93369 | \n",
" 620 | \n",
" 3671 | \n",
" 5.0 | \n",
" 1116476740 | \n",
"
\n",
" \n",
" | 94105 | \n",
" 623 | \n",
" 3671 | \n",
" 5.0 | \n",
" 1019127174 | \n",
"
\n",
" \n",
" | 95945 | \n",
" 633 | \n",
" 3671 | \n",
" 3.5 | \n",
" 1309492285 | \n",
"
\n",
" \n",
" | 97871 | \n",
" 653 | \n",
" 3671 | \n",
" 4.5 | \n",
" 1145390260 | \n",
"
\n",
" \n",
" | 99965 | \n",
" 670 | \n",
" 3671 | \n",
" 3.0 | \n",
" 1065149267 | \n",
"
\n",
" \n",
"
\n",
"
62 rows × 4 columns
\n",
"
"
],
"text/plain": [
" userId movieId rating timestamp\n",
"19 0 3671 3.0 1260759117\n",
"1679 14 3671 2.0 1166586157\n",
"4436 22 3671 3.5 1149868554\n",
"5761 29 3671 4.0 960918106\n",
"8453 55 3671 4.0 1467003357\n",
"10809 72 3671 3.0 1255595938\n",
"11981 74 3671 4.0 1165596914\n",
"12031 75 3671 3.5 1194384277\n",
"12239 76 3671 4.0 1163079471\n",
"13017 82 3671 4.5 1156206112\n",
"16005 101 3671 4.0 959975744\n",
"18453 119 3671 2.5 1167422038\n",
"18840 124 3671 4.5 1269735510\n",
"19659 129 3671 2.0 1149644335\n",
"20255 133 3671 4.5 1361244873\n",
"21162 147 3671 4.0 1059604766\n",
"24820 177 3671 4.0 1437427720\n",
"26709 194 3671 5.0 976288624\n",
"26848 195 3671 5.0 959223213\n",
"28827 211 3671 2.5 1218954775\n",
"31181 221 3671 3.0 960920305\n",
"36982 264 3671 5.0 960060002\n",
"39237 284 3671 3.0 965091993\n",
"39772 290 3671 5.0 1111489095\n",
"40676 293 3671 3.0 1062536588\n",
"43028 305 3671 4.0 959197066\n",
"43260 308 3671 5.0 1114567315\n",
"44986 314 3671 5.0 1046663466\n",
"47765 351 3671 4.0 1420521398\n",
"51493 379 3671 4.0 1199154721\n",
"... ... ... ... ...\n",
"56105 404 3671 4.0 1097698251\n",
"58240 422 3671 3.5 1355514296\n",
"58423 423 3671 3.5 1088826857\n",
"59126 427 3671 4.5 1304130933\n",
"59401 429 3671 4.5 1111489222\n",
"60953 441 3671 5.0 1227968502\n",
"62245 451 3671 4.0 976421517\n",
"64006 459 3671 4.0 1072837173\n",
"66319 467 3671 2.5 1296192490\n",
"67861 471 3671 5.0 958950037\n",
"72067 504 3671 3.5 1340406082\n",
"72953 508 3671 4.0 978938266\n",
"74831 518 3671 4.5 1469927119\n",
"76376 528 3671 4.0 959965342\n",
"81283 552 3671 3.5 1423010113\n",
"82292 560 3671 4.5 1172734423\n",
"84549 563 3671 5.0 974712545\n",
"85997 574 3671 4.0 1012594537\n",
"87014 579 3671 4.0 1155617921\n",
"88086 584 3671 4.0 975363032\n",
"88440 586 3671 4.0 1112034902\n",
"90220 597 3671 4.0 1008571310\n",
"90860 603 3671 3.5 1277532575\n",
"91255 604 3671 4.0 980175465\n",
"92650 614 3671 3.5 1468174876\n",
"93369 620 3671 5.0 1116476740\n",
"94105 623 3671 5.0 1019127174\n",
"95945 633 3671 3.5 1309492285\n",
"97871 653 3671 4.5 1145390260\n",
"99965 670 3671 3.0 1065149267\n",
"\n",
"[62 rows x 4 columns]"
]
},
"execution_count": 111,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# What are the ratings for 3671?\n",
"ratings[ratings.movieId==3671]"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:6: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n"
]
},
{
"data": {
"text/plain": [
"array([ 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 2. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 3.5, 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 3. , 0. , 4. , 3.5, 4. ,\n",
" 0. , 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2.5, 0. ,\n",
" 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 2. , 0. , 0. ,\n",
" 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 5. , 5. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 2.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 3. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. , 0. ,\n",
" 0. , 0. , 0. , 0. , 5. , 0. , 0. , 3. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. ,\n",
" 5. , 0. , 0. , 0. , 0. , 0. , 5. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 3. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 3.5, 3.5, 0. , 0. , 0. , 4.5, 0. ,\n",
" 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 0. , 5. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. ,\n",
" 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4.5,\n",
" 0. , 0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 4. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. ,\n",
" 0. , 4. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 0. , 3.5, 4. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. ,\n",
" 0. , 0. , 0. , 0. , 5. , 0. , 0. , 5. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. ])"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Get the ratings from all users assigned to movie 3671\n",
"# Store ratings in a numpy array with dimension equal to number of users.\n",
"target_movie_id = 3671\n",
"target_movie_vector = np.zeros(len(user_ids))\n",
"for index, row in ratings[ratings.movieId==3671].iterrows():\n",
" target_movie_vector[row.userId] = row.rating\n",
"# Remove target user's rating for this movie:\n",
"target_movie_vector[0] = 0\n",
"target_movie_vector"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[4 5 5 2]\n",
"[3 4 5 1]\n"
]
},
{
"data": {
"text/plain": [
"0.96609178307929588"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.stats import pearsonr\n",
"# Correlation between two item vectors (e.g., all ratings given to movie j)\n",
"def correlation(v1, v2):\n",
" indices = [i for i in range(len(v1)) if v1[i] != 0 and v2[i] != 0]\n",
" print(v1[indices])\n",
" print(v2[indices])\n",
" if len(indices) < 2:\n",
" return 0\n",
" else:\n",
" return pearsonr(v1[indices], v2[indices])[0]\n",
" \n",
" \n",
"correlation(np.array([0,4,0,5,0,5,2]),\n",
" np.array([4,3,0,4,0,5,1]))"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[4 5 5 2]\n",
"[5 4 4 5]\n"
]
},
{
"data": {
"text/plain": [
"-0.81649658092772615"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"correlation(np.array([0,4,0,5,0,5,2]),\n",
" np.array([4,5,0,4,0,4,5])) # change second vector "
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0.63592247307435135, 2294.0), (0.61616638738992613, 1287.0), (0.53082272889989246, 2455.0), (0.39498658934488512, 1343.0), (0.37922646140545141, 1029.0), (0.36838579111178049, 1953.0), (0.33029706730581576, 1405.0), (0.3273268353539886, 31.0), (0.29746710191544001, 2193.0), (0.28743499113013798, 1293.0)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:10: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future\n"
]
}
],
"source": [
"# For every other movie that user 1 rated, compute its correlation with movie 3671\n",
"correlations = [] # (correlation, movieId) tuples\n",
"for index, row in ratings[ratings.userId==0].iterrows(): # for each movie this user has rated.\n",
" if row.movieId != 3671: # ignore Blazing Saddles\n",
" movie = movies[movies.movieId==row.movieId].iloc[0] # iloc: to get index of \n",
" # print(movie['title'])\n",
" movie_vector = np.zeros(len(user_ids))\n",
" # get all user ratings for this title.\n",
" for j, row2 in ratings[ratings.movieId==row.movieId].iterrows():\n",
" movie_vector[row2.userId] = row2.rating\n",
" corr = correlation(target_movie_vector, movie_vector)\n",
" correlations.append((corr, row.movieId))\n",
" \n",
"print(sorted(correlations)[::-1][:10]) "
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1041 | \n",
" 1287 | \n",
" Ben-Hur (1959) | \n",
" Action|Adventure|Drama | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId title genres\n",
"1041 1287 Ben-Hur (1959) Action|Adventure|Drama"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[movies.movieId==1287]"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/awculott/.local/lib/python3.5/site-packages/ipykernel/__main__.py:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n"
]
},
{
"data": {
"text/plain": [
"[2.0, 2.0, 2.5, 2.0, 3.0]"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now, take top K movies and do weighted average to compute predicted score.\n",
"K = 5\n",
"top_movies = sorted(correlations)[::-1][:K]\n",
"top_movie_ids = [int(x[1]) for x in top_movies]\n",
"top_movie_corrs = [x[0] for x in top_movies]\n",
"# get target user's ratings for these movies:\n",
"top_ratings = [ratings[ratings.userId==0][ratings.movieId == tmid]['rating'].iloc[0] for tmid in top_movie_ids]\n",
"top_ratings"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"2.2520948004421606"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# weighted average:\n",
"np.dot(np.array(top_ratings), np.array(top_movie_corrs)) / sum(top_movie_corrs)\n",
"\n",
"# True rating: 3.0"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}