{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# data \"ml-100k\" from http://grouplens.org/datasets/movielens/" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import os\n", "data_folder = os.path.join(\".\", \"data\")\n", "ratings_filename = os.path.join(data_folder, \"u.data\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserIDMovieIDRatingDatetime
019624231997-12-04 15:55:49
118630231998-04-04 19:22:22
22237711997-11-07 07:18:36
32445121997-11-27 05:02:03
416634611998-02-02 05:33:16
\n", "
" ], "text/plain": [ " UserID MovieID Rating Datetime\n", "0 196 242 3 1997-12-04 15:55:49\n", "1 186 302 3 1998-04-04 19:22:22\n", "2 22 377 1 1997-11-07 07:18:36\n", "3 244 51 2 1997-11-27 05:02:03\n", "4 166 346 1 1998-02-02 05:33:16" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_ratings = pd.read_csv(ratings_filename, delimiter=\"\\t\", header=None, names = [\"UserID\", \"MovieID\", \"Rating\", \"Datetime\"])\n", "all_ratings[\"Datetime\"] = pd.to_datetime(all_ratings['Datetime'],unit='s')\n", "all_ratings[:5]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserIDMovieIDRatingDatetime
810986758641998-03-10 00:26:14
9069667522311998-03-10 00:35:51
9265067523511998-03-10 00:35:51
9545967524241998-03-10 00:08:42
8284567524431998-03-10 00:29:35
5329367525831998-03-10 00:11:19
9728667526951998-03-10 00:08:07
9372067527231998-03-10 00:07:11
7338967528641998-03-10 00:07:11
7752467530351998-03-10 00:08:42
4736767530541998-03-10 00:09:08
4430067530651998-03-10 00:08:07
5373067531131998-03-10 00:10:47
5428467531221998-03-10 00:10:24
6329167531851998-03-10 00:21:13
8708267532121998-03-10 00:11:48
5610867534441998-03-10 00:12:34
5304667534741998-03-10 00:07:11
9461767542751998-03-10 00:28:11
6991567546351998-03-10 00:16:43
4674467550951998-03-10 00:24:25
4659867553151998-03-10 00:18:28
5296267565051998-03-10 00:32:51
9402967575041998-03-10 00:08:07
5322367587441998-03-10 00:11:19
6227767589121998-03-10 00:12:59
7727467589651998-03-10 00:09:35
6619467590041998-03-10 00:10:24
5499467593711998-03-10 00:35:51
61742675100741998-03-10 00:25:22
49225675110141998-03-10 00:33:49
50692675125511998-03-10 00:35:51
74202675162851998-03-10 00:30:37
47866675165351998-03-10 00:31:53
\n", "
" ], "text/plain": [ " UserID MovieID Rating Datetime\n", "81098 675 86 4 1998-03-10 00:26:14\n", "90696 675 223 1 1998-03-10 00:35:51\n", "92650 675 235 1 1998-03-10 00:35:51\n", "95459 675 242 4 1998-03-10 00:08:42\n", "82845 675 244 3 1998-03-10 00:29:35\n", "53293 675 258 3 1998-03-10 00:11:19\n", "97286 675 269 5 1998-03-10 00:08:07\n", "93720 675 272 3 1998-03-10 00:07:11\n", "73389 675 286 4 1998-03-10 00:07:11\n", "77524 675 303 5 1998-03-10 00:08:42\n", "47367 675 305 4 1998-03-10 00:09:08\n", "44300 675 306 5 1998-03-10 00:08:07\n", "53730 675 311 3 1998-03-10 00:10:47\n", "54284 675 312 2 1998-03-10 00:10:24\n", "63291 675 318 5 1998-03-10 00:21:13\n", "87082 675 321 2 1998-03-10 00:11:48\n", "56108 675 344 4 1998-03-10 00:12:34\n", "53046 675 347 4 1998-03-10 00:07:11\n", "94617 675 427 5 1998-03-10 00:28:11\n", "69915 675 463 5 1998-03-10 00:16:43\n", "46744 675 509 5 1998-03-10 00:24:25\n", "46598 675 531 5 1998-03-10 00:18:28\n", "52962 675 650 5 1998-03-10 00:32:51\n", "94029 675 750 4 1998-03-10 00:08:07\n", "53223 675 874 4 1998-03-10 00:11:19\n", "62277 675 891 2 1998-03-10 00:12:59\n", "77274 675 896 5 1998-03-10 00:09:35\n", "66194 675 900 4 1998-03-10 00:10:24\n", "54994 675 937 1 1998-03-10 00:35:51\n", "61742 675 1007 4 1998-03-10 00:25:22\n", "49225 675 1101 4 1998-03-10 00:33:49\n", "50692 675 1255 1 1998-03-10 00:35:51\n", "74202 675 1628 5 1998-03-10 00:30:37\n", "47866 675 1653 5 1998-03-10 00:31:53" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# As you can see, there are no review for most movies, such as #213\n", "all_ratings[all_ratings[\"UserID\"] == 675].sort_values(\"MovieID\") " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserIDMovieIDRatingDatetimeFavorable
106225721997-11-12 22:07:14False
11286101451997-11-17 15:38:45True
1220022251997-10-05 09:05:40True
132104031998-03-27 21:59:54False
142242931998-02-21 23:40:57False
\n", "
" ], "text/plain": [ " UserID MovieID Rating Datetime Favorable\n", "10 62 257 2 1997-11-12 22:07:14 False\n", "11 286 1014 5 1997-11-17 15:38:45 True\n", "12 200 222 5 1997-10-05 09:05:40 True\n", "13 210 40 3 1998-03-27 21:59:54 False\n", "14 224 29 3 1998-02-21 23:40:57 False" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Not all reviews are favourable! Our goal is \"other recommended books\", so we only want favourable reviews\n", "all_ratings[\"Favorable\"] = all_ratings[\"Rating\"] > 3\n", "all_ratings[10:15]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserIDMovieIDRatingDatetimeFavorable
20216141997-11-03 07:33:40True
305118931998-03-01 06:15:28False
33313341997-11-03 07:38:19True
334116041997-09-24 03:42:27True
47812041998-02-14 04:51:23True
\n", "
" ], "text/plain": [ " UserID MovieID Rating Datetime Favorable\n", "202 1 61 4 1997-11-03 07:33:40 True\n", "305 1 189 3 1998-03-01 06:15:28 False\n", "333 1 33 4 1997-11-03 07:38:19 True\n", "334 1 160 4 1997-09-24 03:42:27 True\n", "478 1 20 4 1998-02-14 04:51:23 True" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_ratings[all_ratings[\"UserID\"] == 1][:5]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Sample the dataset. You can try increasing the size of the sample, but the run time will be considerably longer\n", "ratings = all_ratings[all_ratings['UserID'].isin(range(200))] # & ratings[\"UserID\"].isin(range(100))]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserIDMovieIDRatingDatetimeFavorable
1612238751997-11-11 17:47:39True
2011939241998-01-30 16:13:34True
2116748641998-04-16 14:54:12True
26389551998-04-13 01:14:54True
286327741997-10-01 23:10:01True
\n", "
" ], "text/plain": [ " UserID MovieID Rating Datetime Favorable\n", "16 122 387 5 1997-11-11 17:47:39 True\n", "20 119 392 4 1998-01-30 16:13:34 True\n", "21 167 486 4 1998-04-16 14:54:12 True\n", "26 38 95 5 1998-04-13 01:14:54 True\n", "28 63 277 4 1997-10-01 23:10:01 True" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We start by creating a dataset of each user's favourable reviews\n", "favorable_ratings = ratings[ratings[\"Favorable\"]]\n", "favorable_ratings[:5]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "199" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We are only interested in the reviewers who have more than one review\n", "favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby(\"UserID\")[\"MovieID\"])\n", "len(favorable_reviews_by_users)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Favorable
MovieID
50100.0
10089.0
25883.0
18179.0
17474.0
\n", "
" ], "text/plain": [ " Favorable\n", "MovieID \n", "50 100.0\n", "100 89.0\n", "258 83.0\n", "181 79.0\n", "174 74.0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Find out how many movies have favourable ratings\n", "num_favorable_by_movie = ratings[[\"MovieID\", \"Favorable\"]].groupby(\"MovieID\").sum()\n", "num_favorable_by_movie.sort_values(\"Favorable\", ascending=False)[:5]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", "\n", "def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):\n", " counts = defaultdict(int)\n", " for user, reviews in favorable_reviews_by_users.items():\n", " for itemset in k_1_itemsets:\n", " if itemset.issubset(reviews):\n", " for other_reviewed_movie in reviews - itemset:\n", " current_superset = itemset | frozenset((other_reviewed_movie,))\n", " counts[current_superset] += 1\n", " return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There are 16 movies with more than 50 favorable reviews\n", "I found 93 frequent itemsets of length 2\n", "I found 295 frequent itemsets of length 3\n", "I found 593 frequent itemsets of length 4\n", "I found 785 frequent itemsets of length 5\n", "I found 677 frequent itemsets of length 6\n", "I found 373 frequent itemsets of length 7\n", "I found 126 frequent itemsets of length 8\n", "I found 24 frequent itemsets of length 9\n", "I found 2 frequent itemsets of length 10\n", "Did not find any frequent itemsets of length 11\n" ] } ], "source": [ "import sys\n", "frequent_itemsets = {} # itemsets are sorted by length\n", "min_support = 50\n", "\n", "# k=1 candidates are the isbns with more than min_support favourable reviews\n", "frequent_itemsets[1] = 
dict((frozenset((movie_id,)), row[\"Favorable\"])\n", " for movie_id, row in num_favorable_by_movie.iterrows()\n", " if row[\"Favorable\"] > min_support)\n", "\n", "print(\"There are {} movies with more than {} favorable reviews\".format(len(frequent_itemsets[1]), min_support))\n", "sys.stdout.flush()\n", "for k in range(2, 20):\n", " # Generate candidates of length k, using the frequent itemsets of length k-1\n", " # Only store the frequent itemsets\n", " cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],\n", " min_support)\n", " if len(cur_frequent_itemsets) == 0:\n", " print(\"Did not find any frequent itemsets of length {}\".format(k))\n", " sys.stdout.flush()\n", " break\n", " else:\n", " print(\"I found {} frequent itemsets of length {}\".format(len(cur_frequent_itemsets), k))\n", " #print(cur_frequent_itemsets)\n", " sys.stdout.flush()\n", " frequent_itemsets[k] = cur_frequent_itemsets\n", "# We aren't interested in the itemsets of length 1, so remove those\n", "del frequent_itemsets[1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found a total of 2968 frequent itemsets\n" ] } ], "source": [ "print(\"Found a total of {0} frequent itemsets\".format(sum(len(itemsets) for itemsets in frequent_itemsets.values())))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "There are 15285 candidate rules\n" ] } ], "source": [ "# Now we create the association rules. 
# Confidence of a rule = (users matching premise AND conclusion) / (users matching premise),
# measured on the training users' favourable reviews.
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for reviews in favorable_reviews_by_users.values():
    for rule in candidate_rules:
        premise, conclusion = rule
        if not premise.issubset(reviews):
            # The rule says nothing about this user.
            continue
        tally = correct_counts if conclusion in reviews else incorrect_counts
        tally[rule] += 1

rule_confidence = {}
for rule in candidate_rules:
    hits = correct_counts[rule]
    misses = incorrect_counts[rule]
    rule_confidence[rule] = hits / float(hits + misses)
# Even better, we can get the movie titles themselves from the dataset.
movie_name_filename = os.path.join(data_folder, "u.item")
# u.item is pipe-delimited with no header row. The ml-100k file is ISO-8859-1
# (Latin-1) encoded; decoding it as mac-roman succeeds silently but maps the
# high bytes to the wrong characters, garbling accented movie titles.
movie_name_data = pd.read_csv(movie_name_filename, delimiter="|", header=None, encoding="latin-1")
# Columns: id, title, dates, IMDB link, then one 0/1 flag per genre.
movie_name_data.columns = ["MovieID", "Title", "Release Date", "Video Release", "IMDB", "", "Action", "Adventure",
                           "Animation", "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
                           "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
"cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def get_movie_name(movie_id):\n", " title_object = movie_name_data[movie_name_data[\"MovieID\"] == movie_id][\"Title\"]\n", " title = title_object.values[0]\n", " return title" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Get Shorty (1995)'" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_movie_name(4)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rule #1\n", "Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) 的人,他也会评论 Raiders of the Lost Ark (1981)\n", " - 置信度Confidence: 1.000\n", "\n", "Rule #2\n", "Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) 的人,他也会评论 Twelve Monkeys (1995)\n", " - 置信度Confidence: 1.000\n", "\n", "Rule #3\n", "Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人,他也会评论 Star Wars (1977)\n", " - 置信度Confidence: 1.000\n", "\n", "Rule #4\n", "Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) 的人,他也会评论 Raiders of the Lost Ark (1981)\n", " - 置信度Confidence: 1.000\n", "\n", "Rule #5\n", "Rule: 评论了 Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) 的人,他也会评论 Return of the Jedi (1983)\n", " - 置信度Confidence: 1.000\n", "\n" ] } ], "source": [ "for index in range(5):\n", " print(\"Rule #{0}\".format(index + 1))\n", " (premise, conclusion) = sorted_confidence[index][0]\n", 
" premise_names = \", \".join(get_movie_name(idx) for idx in premise)\n", " conclusion_name = get_movie_name(conclusion)\n", " print(\"Rule: 评论了 {0} 的人,他也会评论 {1}\".format(premise_names, conclusion_name))\n", " print(\" - 置信度Confidence: {0:.3f}\".format(rule_confidence[(premise, conclusion)]))\n", " print(\"\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Evaluation using test data\n", "test_dataset = all_ratings[~all_ratings['UserID'].isin(range(200))]\n", "test_favorable = test_dataset[test_dataset[\"Favorable\"]]\n", "#test_not_favourable = test_dataset[~test_dataset[\"Favourable\"]]\n", "test_favorable_by_users = dict((k, frozenset(v.values)) for k, v in test_favorable.groupby(\"UserID\")[\"MovieID\"])\n", "#test_not_favourable_by_users = dict((k, frozenset(v.values)) for k, v in test_not_favourable.groupby(\"UserID\")[\"MovieID\"])\n", "#test_users = test_dataset[\"UserID\"].unique()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserIDMovieIDRatingDatetimeFavorable
32445121997-11-27 05:02:03False
529847441998-01-07 14:20:06True
725346551998-04-03 18:34:27True
830545131998-02-01 09:20:17False
11286101451997-11-17 15:38:45True
\n", "
" ], "text/plain": [ " UserID MovieID Rating Datetime Favorable\n", "3 244 51 2 1997-11-27 05:02:03 False\n", "5 298 474 4 1998-01-07 14:20:06 True\n", "7 253 465 5 1998-04-03 18:34:27 True\n", "8 305 451 3 1998-02-01 09:20:17 False\n", "11 286 1014 5 1997-11-17 15:38:45 True" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_dataset[:5]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "correct_counts = defaultdict(int)\n", "incorrect_counts = defaultdict(int)\n", "for user, reviews in test_favorable_by_users.items():\n", " for candidate_rule in candidate_rules:\n", " premise, conclusion = candidate_rule\n", " if premise.issubset(reviews):\n", " if conclusion in reviews:\n", " correct_counts[candidate_rule] += 1\n", " else:\n", " incorrect_counts[candidate_rule] += 1" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5152\n" ] } ], "source": [ "test_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])\n", " for candidate_rule in rule_confidence}\n", "print(len(test_confidence))" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[((frozenset({64, 1, 7, 172, 79, 50}), 174), 1.0), ((frozenset({64, 258, 98, 7, 174, 181}), 172), 1.0), ((frozenset({64, 1, 98, 7, 79, 181, 56}), 174), 1.0), ((frozenset({64, 1, 98, 7, 172, 79, 181}), 174), 1.0), ((frozenset({64, 258, 98, 7, 174, 50, 181}), 172), 1.0)]\n" ] } ], "source": [ "sorted_test_confidence = sorted(test_confidence.items(), key=itemgetter(1), reverse=True)\n", "print(sorted_test_confidence[:5])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rule #1\n", "Rule: 评论了 Shawshank Redemption, 
The (1994), Silence of the Lambs, The (1991), Pulp Fiction (1994), Star Wars (1977), Twelve Monkeys (1995) 的人,他也会评论 Raiders of the Lost Ark (1981)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.909\n", "\n", "Rule #2\n", "Rule: 评论了 Silence of the Lambs, The (1991), Fargo (1996), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977), Pulp Fiction (1994) 的人,他也会评论 Twelve Monkeys (1995)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.609\n", "\n", "Rule #3\n", "Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Return of the Jedi (1983), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人,他也会评论 Star Wars (1977)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.946\n", "\n", "Rule #4\n", "Rule: 评论了 Shawshank Redemption, The (1994), Silence of the Lambs, The (1991), Fargo (1996), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Star Wars (1977) 的人,他也会评论 Raiders of the Lost Ark (1981)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.971\n", "\n", "Rule #5\n", "Rule: 评论了 Shawshank Redemption, The (1994), Toy Story (1995), Twelve Monkeys (1995), Empire Strikes Back, The (1980), Fugitive, The (1993), Star Wars (1977) 的人,他也会评论 Return of the Jedi (1983)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.900\n", "\n", "Rule #6\n", "Rule: 评论了 Toy Story (1995), Silence of the Lambs, The (1991), Fargo (1996), Raiders of the Lost Ark (1981), Godfather, The (1972) 的人,他也会评论 Pulp Fiction (1994)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.750\n", "\n", "Rule #7\n", "Rule: 评论了 Silence of the Lambs, The (1991), Empire Strikes Back, The (1980), Godfather, The (1972), Raiders of the Lost Ark (1981), Twelve Monkeys (1995) 的人,他也会评论 Shawshank Redemption, The (1994)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.854\n", "\n", "Rule #8\n", "Rule: 评论了 Pulp Fiction (1994), Toy Story (1995), Shawshank Redemption, The (1994), Godfather, The (1972) 的人,他也会评论 Silence of the Lambs, The (1991)\n", " - 训练集上的置信度: 1.000\n", " - 测试集上的置信度: 0.870\n", "\n", "Rule #9\n", "Rule: 评论了 
# Show the ten rules with the highest training confidence next to their
# confidence on the test users. Slicing (rather than indexing range(10))
# keeps the cell working when fewer than ten rules passed the filter.
for rule_number, (rule, _) in enumerate(sorted_confidence[:10], start=1):
    premise, conclusion = rule
    print("Rule #{0}".format(rule_number))
    premise_names = ", ".join(get_movie_name(idx) for idx in premise)
    conclusion_name = get_movie_name(conclusion)
    print("Rule: 评论了 {0} 的人,他也会评论 {1}".format(premise_names, conclusion_name))
    # -1 marks a rule with no recorded confidence on that data set.
    print(" - 训练集上的置信度: {0:.3f}".format(rule_confidence.get((premise, conclusion), -1)))
    print(" - 测试集上的置信度: {0:.3f}".format(test_confidence.get((premise, conclusion), -1)))
    print("")
false, "toc_section_display": "block", "toc_window_display": false, "widenNotebook": false } }, "nbformat": 4, "nbformat_minor": 2 }