{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#简单的user-based协同过滤算法示例代码\n", "#七月算法:寒老师\n", "#2016-03-26\n", "\n", "#构造一份打分数据集,可以去movielens下载真实的数据做实验\n", "users = {\"小明\": {\"中国合伙人\": 5.0, \"太平轮\": 3.0, \"荒野猎人\": 4.5, \"老炮儿\": 5.0, \"我的少女时代\": 3.0, \"肖洛特烦恼\": 4.5, \"火星救援\": 5.0},\n", " \"小红\":{\"小时代4\": 4.0, \"荒野猎人\": 3.0, \"我的少女时代\": 5.0, \"肖洛特烦恼\": 5.0, \"火星救援\": 3.0, \"后会无期\": 3.0},\n", " \"小阳\": {\"小时代4\": 2.0, \"中国合伙人\": 5.0, \"我的少女时代\": 3.0, \"老炮儿\": 5.0, \"肖洛特烦恼\": 4.5, \"速度与激情7\": 5.0},\n", " \"小四\": {\"小时代4\": 5.0, \"中国合伙人\": 3.0, \"我的少女时代\": 4.0, \"匆匆那年\": 4.0, \"速度与激情7\": 3.5, \"火星救援\": 3.5, \"后会无期\": 4.5},\n", " \"六爷\": {\"小时代4\": 2.0, \"中国合伙人\": 4.0, \"荒野猎人\": 4.5, \"老炮儿\": 5.0, \"我的少女时代\": 2.0},\n", " \"小李\": {\"荒野猎人\": 5.0, \"盗梦空间\": 5.0, \"我的少女时代\": 3.0, \"速度与激情7\": 5.0, \"蚁人\": 4.5, \"老炮儿\": 4.0, \"后会无期\": 3.5},\n", " \"隔壁老王\": {\"荒野猎人\": 5.0, \"中国合伙人\": 4.0, \"我的少女时代\": 1.0, \"Phoenix\": 5.0, \"甄嬛传\": 4.0, \"The Strokes\": 5.0},\n", " \"邻村小芳\": {\"小时代4\": 4.0, \"我的少女时代\": 4.5, \"匆匆那年\": 4.5, \"甄嬛传\": 2.5, \"The Strokes\": 3.0}\n", " }" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#定义几种距离计算函数\n", "#更高效的方式为把得分向量化之后使用scipy中定义的distance方法\n", "\n", "from math import sqrt\n", "def euclidean_dis(rating1, rating2):\n", " \"\"\"计算2个打分序列间的欧式距离. 输入的rating1和rating2都是打分dict\n", " 格式为{'小时代4': 1.0, '疯狂动物城': 5.0}\"\"\"\n", " distance = 0\n", " commonRatings = False \n", " for key in rating1:\n", " if key in rating2:\n", " distance += (rating1[key] - rating2[key])^2\n", " commonRatings = True\n", " #两个打分序列之间有公共打分电影\n", " if commonRatings:\n", " return distance\n", " #无公共打分电影\n", " else:\n", " return -1\n", "\n", "\n", "def manhattan_dis(rating1, rating2):\n", " \"\"\"计算2个打分序列间的曼哈顿距离. 输入的rating1和rating2都是打分dict\n", " 格式为{'小时代4': 1.0, '疯狂动物城': 5.0}\"\"\"\n", " distance = 0\n", " commonRatings = False \n", " for key in rating1:\n", " if key in rating2:\n", " distance += abs(rating1[key] - rating2[key])\n", " commonRatings = True\n", " #两个打分序列之间有公共打分电影\n", " if commonRatings:\n", " return distance\n", " #无公共打分电影\n", " else:\n", " return -1\n", "\n", "def cos_dis(rating1, rating2):\n", " \"\"\"计算2个打分序列间的cos距离. 输入的rating1和rating2都是打分dict\n", " 格式为{'小时代4': 1.0, '疯狂动物城': 5.0}\"\"\"\n", " distance = 0\n", " dot_product_1 = 0\n", " dot_product_2 = 0\n", " commonRatings = False\n", " \n", " for score in rating1.values():\n", " dot_product_1 += score^2\n", " for score in rating2.values():\n", " dot_product_2 += score^2\n", " \n", " for key in rating1:\n", " if key in rating2:\n", " distance += rating1[key] * rating2[key]\n", " commonRatings = True\n", " #两个打分序列之间有公共打分电影\n", " if commonRatings:\n", " return 1-distance/sqrt(dot_product_1*dot_product_2)\n", " #无公共打分电影\n", " else:\n", " return -1\n", "\n", "def pearson_dis(rating1, rating2):\n", " \"\"\"计算2个打分序列间的pearson距离. 输入的rating1和rating2都是打分dict\n", " 格式为{'小时代4': 1.0, '疯狂动物城': 5.0}\"\"\"\n", " sum_xy = 0\n", " sum_x = 0\n", " sum_y = 0\n", " sum_x2 = 0\n", " sum_y2 = 0\n", " n = 0\n", " for key in rating1:\n", " if key in rating2:\n", " n += 1\n", " x = rating1[key]\n", " y = rating2[key]\n", " sum_xy += x * y\n", " sum_x += x\n", " sum_y += y\n", " sum_x2 += pow(x, 2)\n", " sum_y2 += pow(y, 2)\n", " # now compute denominator\n", " denominator = sqrt(sum_x2 - pow(sum_x, 2) / n) * sqrt(sum_y2 - pow(sum_y, 2) / n)\n", " if denominator == 0:\n", " return 0\n", " else:\n", " return (sum_xy - (sum_x * sum_y) / n) / denominator" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#查找最近邻\n", "def computeNearestNeighbor(username, users):\n", " \"\"\"在给定username的情况下,计算其他用户和它的距离并排序\"\"\"\n", " distances = []\n", " for user in users:\n", " if user != username:\n", " #distance = manhattan_dis(users[user], users[username])\n", " distance = pearson_dis(users[user], users[username])\n", " distances.append((distance, user))\n", " # 根据距离排序,距离越近,排得越靠前\n", " distances.sort()\n", " return distances\n", "\n", "#推荐\n", "def recommend(username, users):\n", " \"\"\"对指定的user推荐电影\"\"\"\n", " # 找到最近邻\n", " nearest = computeNearestNeighbor(username, users)[0][1]\n", "\n", " recommendations = []\n", " # 找到最近邻看过,但是我们没看过的电影,计算推荐\n", " neighborRatings = users[nearest]\n", " userRatings = users[username]\n", " for artist in neighborRatings:\n", " if not artist in userRatings:\n", " recommendations.append((artist, neighborRatings[artist]))\n", " results = sorted(recommendations, key=lambda artistTuple: artistTuple[1], reverse = True)\n", " for result in results:\n", " print result[0], result[1]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "肖洛特烦恼 5.0\n", "后会无期 3.0\n", "火星救援 3.0\n" ] } ], "source": [ "recommend('六爷', users)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#简单的张量分解进行打分和推荐\n", "#要用到numpy模块\n", "import numpy\n", "\n", "#手写矩阵分解\n", "#现在有很多很方便对高维矩阵做分解的package,比如libmf, svdfeature等\n", "def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):\n", " Q = Q.T\n", " for step in xrange(steps):\n", " for i in xrange(len(R)):\n", " for j in xrange(len(R[i])):\n", " if R[i][j] > 0:\n", " eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])\n", " for k in xrange(K):\n", " P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])\n", " Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])\n", " eR = numpy.dot(P,Q)\n", " e = 0\n", " for i in xrange(len(R)):\n", " for j in xrange(len(R[i])):\n", " if R[i][j] > 0:\n", " e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)\n", " for k in xrange(K):\n", " e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))\n", " if e < 0.001:\n", " break\n", " return P, Q.T" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [], "source": [ "#读取user数据并用张量分解进行打分\n", "\n", "R = [\n", " [5,3,0,1],\n", " [4,0,3,1],\n", " [1,1,0,5],\n", " [1,0,0,4],\n", " [0,1,5,4],\n", " ]\n", "\n", "R = numpy.array(R)\n", "\n", "N = len(R)\n", "M = len(R[0])\n", "K = 2\n", "\n", "P = numpy.random.rand(N,K)\n", "Q = numpy.random.rand(M,K)\n", "\n", "nP, nQ = matrix_factorization(R, P, Q, K)\n", "nR = numpy.dot(nP, nQ.T)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0.38345373, 2.181972 ],\n", " [ 0.32334816, 1.56283276],\n", " [ 1.99170613, 0.16400981],\n", " [ 1.59666903, 0.14124969],\n", " [ 1.64308192, 1.07125805]])" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nP" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 0.38946426, 2.29198167],\n", " [ 0.19720283, 1.18916254],\n", " [ 1.71589715, 1.76060186],\n", " [ 2.48314488, 0.03019937]])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nQ" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[ 5.15038133, 2.67033753, 4.49955112, 1.01806534],\n", " [ 3.70791658, 1.92222735, 3.30635845, 0.85011689],\n", " [ 1.15160585, 0.58780442, 3.70631887, 4.95064787],\n", " [ 0.94558722, 0.48283649, 2.98840431, 3.96902618],\n", " [ 3.0952255 , 1.59792036, 4.70541851, 4.11236178]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nR" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[5, 3, 0, 1],\n", " [4, 0, 3, 1],\n", " [1, 1, 0, 5],\n", " [1, 0, 0, 4],\n", " [0, 1, 5, 4]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "R" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }