{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Importing Libraries and Loading Data " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Reading movies file\n", "movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['title', 'genres'])" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenres
0Toy StoryAdventure|Animation|Children|Comedy|Fantasy
1JumanjiAdventure|Children|Fantasy
2Grumpier Old MenComedy|Romance
3Waiting to ExhaleComedy|Drama|Romance
4Father of the Bride Part IIComedy
\n", "
" ], "text/plain": [ " title genres\n", "0 Toy Story Adventure|Animation|Children|Comedy|Fantasy\n", "1 Jumanji Adventure|Children|Fantasy\n", "2 Grumpier Old Men Comedy|Romance\n", "3 Waiting to Exhale Comedy|Drama|Romance\n", "4 Father of the Bride Part II Comedy" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Break up the big genre string into a string array\n", "movies['genres'] = movies['genres'].str.split('|')\n", "# Convert genres to string value\n", "movies['genres'] = movies['genres'].fillna(\"\").astype('str')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Recommendation based on Genre" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(9742, 177)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')\n", "tfidf_matrix = tf.fit_transform(movies['genres'])\n", "tfidf_matrix.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0.31379419, 0.0611029 , 0.05271111],\n", " [0.31379419, 1. , 0. , 0. ],\n", " [0.0611029 , 0. , 1. , 0.35172407],\n", " [0.05271111, 0. , 0.35172407, 1. ]])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)\n", "cosine_sim[:4, :4]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Build a 1-dimensional array with movie titles\n", "titles = movies['title']\n", "indices = pd.Series(movies.index, index=movies['title'])\n", "\n", "# Function that get movie recommendations based on the cosine similarity score of movie genres\n", "def genre_recommendations(title):\n", " idx = indices[title]\n", " sim_scores = list(enumerate(cosine_sim[idx]))\n", " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n", " sim_scores = sim_scores[1:21]\n", " movie_indices = [i[0] for i in sim_scores]\n", " return titles.iloc[movie_indices]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8387 Need for Speed \n", "8149 Grandmaster, The (Yi dai zong shi) \n", "123 Apollo 13 \n", "8026 Life of Pi \n", "8396 Noah \n", "38 Dead Presidents \n", "341 Bad Company \n", "347 Faster Pussycat! Kill! Kill! \n", "430 Menace II Society \n", "568 Substitute, The \n", "665 Nothing to Lose \n", "1645 Untouchables, The \n", "1696 Monument Ave. \n", "2563 Death Wish \n", "2574 Band of the Hand \n", "3037 Foxy Brown \n", "3124 Harley Davidson and the Marlboro Man \n", "3167 Scarface \n", "3217 Swordfish \n", "3301 Above the Law \n", "Name: title, dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genre_recommendations('Dark Knight ').head(20)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Recommendation based on Title" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(9742, 20558)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')\n", "tfidf_matrix = tf.fit_transform(movies['title'])\n", "tfidf_matrix.shape" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1., 0., 0., 0.],\n", " [0., 1., 0., 0.],\n", " [0., 0., 1., 0.],\n", " [0., 0., 0., 1.]])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)\n", "cosine_sim[:4, :4]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Build a 1-dimensional array with movie titles\n", "titles = movies['title']\n", "indices = pd.Series(movies.index, index=movies['title'])\n", "\n", "# Function that get movie recommendations based on the cosine similarity score of movie genres\n", "def title_recommendations(title):\n", " idx = indices[title]\n", " sim_scores = list(enumerate(cosine_sim[idx]))\n", " sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)\n", " sim_scores = sim_scores[1:21]\n", " movie_indices = [i[0] for i in sim_scores]\n", " return titles.iloc[movie_indices]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7768 Dark Knight Rises, The \n", "8032 Batman: The Dark Knight Returns, Part 1 \n", "8080 Batman: The Dark Knight Returns, Part 2 \n", "140 First Knight \n", "2417 Cry in the Dark, A \n", "5778 Alone in the Dark \n", "7375 Knight and Day \n", "3576 Black Knight \n", "3190 Knight's Tale, A \n", "6858 Alone in the Dark II \n", "4242 Dark Blue \n", "5060 Dark Days \n", "1305 Dark City \n", "5483 Dark Star \n", "6815 Batman: Gotham Knight \n", "5934 Dark Water \n", "4749 Shot in the Dark, A \n", "7877 Dark Shadows \n", "8766 The Dark Valley \n", "6690 Taxi to the Dark Side \n", "Name: title, dtype: object" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "title_recommendations('Dark Knight ').head(20)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }